31#ifndef __STDC_FORMAT_MACROS
32#define __STDC_FORMAT_MACROS
34#define VKFFT_BACKEND 0
37#if defined(VK_USE_PLATFORM_ANDROID_KHR)
38#include <android/native_activity.h>
39#include <android/asset_manager.h>
40#include <android_native_app_glue.h>
41#include <sys/system_properties.h>
44#include "vulkan/vulkan.h"
45#include "glslang_c_interface.h"
46#elif(VKFFT_BACKEND==1)
49#include <cuda_runtime.h>
50#include <cuda_runtime_api.h>
52#elif(VKFFT_BACKEND==2)
53#include <hip/hiprtc.h>
54#include <hip/hip_runtime.h>
55#include <hip/hip_runtime_api.h>
56#include <hip/hip_complex.h>
57#elif(VKFFT_BACKEND==3)
58#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
59#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
62#include <OpenCL/opencl.h>
86// This file is part of VkFFT, a Vulkan Fast Fourier Transform library\n\
88// Copyright (C) 2020 - present Dmitrii Tolmachev <dtolm96@gmail.com>\n\
90// Permission is hereby granted, free of charge, to any person obtaining a copy\n\
91// of this software and associated documentation files (the \"Software\"), to deal\n\
92// in the Software without restriction, including without limitation the rights\n\
93// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n\
94// copies of the Software, and to permit persons to whom the Software is\n\
95// furnished to do so, subject to the following conditions:\n\
97// The above copyright notice and this permission notice shall be included in\n\
98// all copies or substantial portions of the Software.\n\
100// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n\
101// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n\
102// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n\
103// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n\
104// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n\
105// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n\
114 %s = %s;\n", out, in);
122 %s = %s;\n", out, in);
130 sdata[%s] = %s;\n",
id, in);
138 %s = sdata[%s];\n", out,
id);
146 %s = %s + %s;\n", out, in_1, in_2);
154 %s.x = %s.x + %s.x;\n\
155 %s.y = %s.y + %s.y;\n", out, in_1, in_2, out, in_1, in_2);
163 %s.x = - %s.x - %s.x;\n\
164 %s.y = - %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2);
172 %s.x = %s.x - %s.x;\n\
173 %s.y = %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2);
181 %s = %s - %s;\n", out, in_1, in_2);
189 %s.x = fma(%s.x, %s, %s.x);\n\
190 %s.y = fma(%s.y, %s, %s.y);\n", out, in_1, in_num, in_2, out, in_1, in_num, in_2);
198 %s = fma(%s, %s, %s);\n", out, in_1, in_num, in_2);
205 if (strcmp(out, in_1) && strcmp(out, in_2)) {
207 %s.x = %s.x * %s.x - %s.y * %s.y;\n\
208 %s.y = %s.y * %s.x + %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2);
213 %s.x = %s.x * %s.x - %s.y * %s.y;\n\
214 %s.y = %s.y * %s.x + %s.x * %s.y;\n\
215 %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp);
226 if (strcmp(out, in_1) && strcmp(out, in_2)) {
228 %s.x = %s.x * %s.x + %s.y * %s.y;\n\
229 %s.y = %s.y * %s.x - %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2);
234 %s.x = %s.x * %s.x + %s.y * %s.y;\n\
235 %s.y = %s.y * %s.x - %s.x * %s.y;\n\
236 %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp);
249 %s.y = %s.y * %s;\n", out, in_1, in_num, out, in_1, in_num);
256 if (strcmp(out, in_1)) {
258 %s.x = - %s.y * %s;\n\
259 %s.y = %s.x * %s;\n", out, in_1, in_num, out, in_1, in_num);
264 %s.x = - %s.y * %s;\n\
266 %s = %s;\n", temp, in_1, in_num, temp, in_1, in_num, out, temp);
279 %s.y = %s.y / %s;\n", out, in_1, in_num, out, in_1, in_num);
288 %s = %s * %s;\n", out, in_1, in_2);
296 if (strcmp(out, in_2)) {
298 %s.x = %s.x - %s.y;\n\
299 %s.y = %s.y + %s.x;\n", out, in_1, in_2, out, in_1, in_2);
304 %s.x = %s.x - %s.y;\n\
305 %s.y = %s.x + %s.y;\n\
306 %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp);
317 if (strcmp(out, in_2)) {
319 %s.x = %s.x + %s.y;\n\
320 %s.y = %s.y - %s.x;\n", out, in_1, in_2, out, in_1, in_2);
325 %s.x = %s.x + %s.y;\n\
326 %s.y = %s.x - %s.y;\n\
327 %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp);
339 %s = %s %% %s;\n", out, in_1, in_num);
347 %s = %s / %s;\n", out, in_1, in_num);
354 char temp_ID[13][20];
356 for (uint64_t i = 0; i < num_elem; i++)
357 sprintf(temp_ID[i],
"%s", sc->
locID[i]);
358 for (uint64_t i = 0; i < num_elem; i++)
359 sprintf(sc->
locID[i],
"%s", temp_ID[permute[i]]);
362 for (uint64_t i = 0; i < num_elem; i++)
363 sprintf(temp_ID[i],
"%s", regIDs[i]);
364 for (uint64_t i = 0; i < num_elem; i++)
365 sprintf(regIDs[i],
"%s", temp_ID[permute[i]]);
389 if ((!strcmp(floatType,
"double")) || (sc->
useUint64)) {
391#extension GL_ARB_gpu_shader_fp64 : enable\n\
392#extension GL_ARB_gpu_shader_int64 : enable\n\n");
396 if ((!strcmp(floatTypeInputMemory,
"half")) || (!strcmp(floatTypeOutputMemory,
"half")) || (!strcmp(floatTypeKernelMemory,
"half"))) {
397 sc->
tempLen = sprintf(sc->
tempStr,
"#extension GL_EXT_shader_16bit_storage : require\n\n");
401#elif(VKFFT_BACKEND==1)
402#elif(VKFFT_BACKEND==2)
405#include <hip/hip_runtime.h>\n");
409#elif(VKFFT_BACKEND==3)
410 if ((!strcmp(floatType,
"double")) || (sc->
useUint64)) {
412#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\
413#pragma OPENCL EXTENSION cl_khr_int64 : enable\n\n");
426#elif(VKFFT_BACKEND==1)
427#elif(VKFFT_BACKEND==2)
428#elif(VKFFT_BACKEND==3)
435 sc->
tempLen = sprintf(sc->
tempStr,
"__constant %s %s = %s%s;\n", type, name, defaultVal, LFending);
439 sc->
tempLen = sprintf(sc->
tempStr,
"const %s %s = %s%s;\n", type, name, defaultVal, LFending);
455 for (uint64_t i = 0; i < numTab; i++)
461#elif(VKFFT_BACKEND==1)
462 sc->
tempLen = sprintf(sc->
tempStr,
"%s__syncthreads();\n\n", tabs);
465#elif(VKFFT_BACKEND==2)
466 sc->
tempLen = sprintf(sc->
tempStr,
"%s__syncthreads();\n\n", tabs);
469#elif(VKFFT_BACKEND==3)
470 sc->
tempLen = sprintf(sc->
tempStr,
"%sbarrier(CLK_LOCAL_MEM_FENCE);\n\n", tabs);
479 sc->
tempLen = sprintf(sc->
tempStr,
"layout(push_constant) uniform PushConsts\n{\n");
491#elif(VKFFT_BACKEND==1)
504 sc->
tempLen = sprintf(sc->
tempStr,
" __constant__ PushConsts consts;\n");
507#elif(VKFFT_BACKEND==2)
520 sc->
tempLen = sprintf(sc->
tempStr,
" __constant__ PushConsts consts;\n");
523#elif(VKFFT_BACKEND==3)
542 char LFending[4] =
"";
543 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
545 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
546#elif(VKFFT_BACKEND==1)
547 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
548#elif(VKFFT_BACKEND==2)
549 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
550#elif(VKFFT_BACKEND==3)
553 res =
appendConstant(sc, floatType,
"loc_PI",
"3.1415926535897932384626433832795", LFending);
555 res =
appendConstant(sc, floatType,
"loc_SQRT1_2",
"0.70710678118654752440084436210485", LFending);
561 char functionDefinitions[100] =
"";
563 char LFending[4] =
"";
564 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
566 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
567 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
568 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
569 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
570#elif(VKFFT_BACKEND==1)
571 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
572 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
573 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
574 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
575 sprintf(functionDefinitions,
"__device__ static __inline__ ");
576#elif(VKFFT_BACKEND==2)
577 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
578 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
579 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
580 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
581 sprintf(functionDefinitions,
"__device__ static __inline__ ");
582#elif(VKFFT_BACKEND==3)
583 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
584 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
585 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
587 sprintf(functionDefinitions,
"static __inline__ ");
589 res =
appendConstant(sc, floatType,
"loc_2_PI",
"0.63661977236758134307553505349006", LFending);
591 res =
appendConstant(sc, floatType,
"loc_PI_2",
"1.5707963267948966192313216916398", LFending);
593 res =
appendConstant(sc, floatType,
"a1",
"0.99999999999999999999962122687403772", LFending);
595 res =
appendConstant(sc, floatType,
"a3",
"-0.166666666666666666637194166219637268", LFending);
597 res =
appendConstant(sc, floatType,
"a5",
"0.00833333333333333295212653322266277182", LFending);
599 res =
appendConstant(sc, floatType,
"a7",
"-0.000198412698412696489459896530659927773", LFending);
601 res =
appendConstant(sc, floatType,
"a9",
"2.75573192239364018847578909205399262e-6", LFending);
603 res =
appendConstant(sc, floatType,
"a11",
"-2.50521083781017605729370231280411712e-8", LFending);
605 res =
appendConstant(sc, floatType,
"a13",
"1.60590431721336942356660057796782021e-10", LFending);
607 res =
appendConstant(sc, floatType,
"a15",
"-7.64712637907716970380859898835680587e-13", LFending);
609 res =
appendConstant(sc, floatType,
"a17",
"2.81018528153898622636194976499656274e-15", LFending);
611 res =
appendConstant(sc, floatType,
"ab",
"-7.97989713648499642889739108679114937e-18", LFending);
614%s%s sincos_20(double x)\n\
616 //minimax coefs for sin for 0..pi/2 range\n\
617 double y = abs(x * loc_2_PI);\n\
618 double q = floor(y);\n\
619 int quadrant = int(q);\n\
620 double t = (quadrant & 1) != 0 ? 1 - y + q : y - q;\n\
622 double t2 = t * t;\n\
623 double r = fma(fma(fma(fma(fma(fma(fma(fma(fma(ab, t2, a17), t2, a15), t2, a13), t2, a11), t2, a9), t2, a7), t2, a5), t2, a3), t2 * t, t);\n\
625 cos_sin.x = ((quadrant == 0) || (quadrant == 3)) ? sqrt(1 - r * r) : -sqrt(1 - r * r);\n\
626 r = x < 0 ? -r : r;\n\
627 cos_sin.y = (quadrant & 2) != 0 ? -r : r;\n\
629}\n\n", functionDefinitions, vecType, vecType);
637 char functionDefinitions[100] =
"";
639 char vecTypeDifferent[30];
642#elif(VKFFT_BACKEND==1)
643 sprintf(functionDefinitions,
"__device__ static __inline__ ");
644#elif(VKFFT_BACKEND==2)
645 sprintf(functionDefinitions,
"__device__ static __inline__ ");
646#elif(VKFFT_BACKEND==3)
647 sprintf(functionDefinitions,
"static __inline__ ");
650 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
651 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
652 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
653 if (!strcmp(floatTypeDifferent,
"half")) sprintf(vecTypeDifferent,
"f16vec2");
654 if (!strcmp(floatTypeDifferent,
"float")) sprintf(vecTypeDifferent,
"float2");
655 if (!strcmp(floatTypeDifferent,
"double")) sprintf(vecTypeDifferent,
"double2");
657%s%s conv_%s(%s input)\n\
660 ret_val.x = (%s) input.x;\n\
661 ret_val.y = (%s) input.y;\n\
663}\n\n", functionDefinitions, vecType, vecType, vecTypeDifferent, vecType, floatType, floatType);
667%s%s conv_%s(%s input)\n\
670 ret_val.x = (%s) input.x;\n\
671 ret_val.y = (%s) input.y;\n\
673}\n\n", functionDefinitions, vecTypeDifferent, vecTypeDifferent, vecType, vecTypeDifferent, floatTypeDifferent, floatTypeDifferent);
683 case 0:
case 1:
case 2:
case 3:
case 4:
case 6: {
685 if (!strcmp(floatTypeMemory,
"half")) {
687 sprintf(vecType,
"f16vec2");
689 if (!strcmp(floatTypeMemory,
"float")) {
691 sprintf(vecType,
"vec2");
693 if (!strcmp(floatTypeMemory,
"double")) {
695 sprintf(vecType,
"dvec2");
699layout(std430, binding = %" PRIu64
") buffer DataIn{\n\
700 %s inputs[%" PRIu64
"];\n\
707layout(std430, binding = %" PRIu64
") buffer DataIn{\n\
708 %s inputs[%" PRIu64
"];\n\
713#elif(VKFFT_BACKEND==1)
714 if (!strcmp(floatTypeMemory,
"half")) {
716 sprintf(vecType,
"f16vec2");
718 if (!strcmp(floatTypeMemory,
"float")) {
720 sprintf(vecType,
"float2");
722 if (!strcmp(floatTypeMemory,
"double")) {
724 sprintf(vecType,
"double2");
726#elif(VKFFT_BACKEND==2)
727 if (!strcmp(floatTypeMemory,
"half")) {
729 sprintf(vecType,
"f16vec2");
731 if (!strcmp(floatTypeMemory,
"float")) {
733 sprintf(vecType,
"float2");
735 if (!strcmp(floatTypeMemory,
"double")) {
737 sprintf(vecType,
"double2");
739#elif(VKFFT_BACKEND==3)
740 if (!strcmp(floatTypeMemory,
"half")) {
742 sprintf(vecType,
"f16vec2");
744 if (!strcmp(floatTypeMemory,
"float")) {
746 sprintf(vecType,
"float2");
748 if (!strcmp(floatTypeMemory,
"double")) {
750 sprintf(vecType,
"double2");
755 case 5:
case 110:
case 111:
case 120:
case 121:
case 130:
case 131:
case 140:
case 141:
case 142:
case 143:
case 144:
case 145:
757 if (!strcmp(floatTypeMemory,
"half")) {
759 sprintf(vecType,
"float16_t");
761 if (!strcmp(floatTypeMemory,
"float")) {
763 sprintf(vecType,
"float");
765 if (!strcmp(floatTypeMemory,
"double")) {
767 sprintf(vecType,
"double");
772layout(std430, binding = %" PRIu64
") buffer DataIn{\n\
773 %s inputs[%" PRIu64
"];\n\
780layout(std430, binding = %" PRIu64
") buffer DataIn{\n\
781 %s inputs[%" PRIu64
"];\n\
795 switch (outputType) {
796 case 0:
case 1:
case 2:
case 3:
case 4:
case 5: {
798 if (!strcmp(floatTypeMemory,
"half")) {
800 sprintf(vecType,
"f16vec2");
802 if (!strcmp(floatTypeMemory,
"float")) {
804 sprintf(vecType,
"vec2");
806 if (!strcmp(floatTypeMemory,
"double")) {
808 sprintf(vecType,
"dvec2");
812layout(std430, binding = %" PRIu64
") buffer DataOut{\n\
813 %s outputs[%" PRIu64
"];\n\
820layout(std430, binding = %" PRIu64
") buffer DataOut{\n\
821 %s outputs[%" PRIu64
"];\n\
826#elif(VKFFT_BACKEND==1)
827 if (!strcmp(floatTypeMemory,
"half")) {
829 sprintf(vecType,
"f16vec2");
831 if (!strcmp(floatTypeMemory,
"float")) {
833 sprintf(vecType,
"float2");
835 if (!strcmp(floatTypeMemory,
"double")) {
837 sprintf(vecType,
"double2");
839#elif(VKFFT_BACKEND==2)
840 if (!strcmp(floatTypeMemory,
"half")) {
842 sprintf(vecType,
"f16vec2");
844 if (!strcmp(floatTypeMemory,
"float")) {
846 sprintf(vecType,
"float2");
848 if (!strcmp(floatTypeMemory,
"double")) {
850 sprintf(vecType,
"double2");
852#elif(VKFFT_BACKEND==3)
853 if (!strcmp(floatTypeMemory,
"half")) {
855 sprintf(vecType,
"f16vec2");
857 if (!strcmp(floatTypeMemory,
"float")) {
859 sprintf(vecType,
"float2");
861 if (!strcmp(floatTypeMemory,
"double")) {
863 sprintf(vecType,
"double2");
868 case 6:
case 110:
case 111:
case 120:
case 121:
case 130:
case 131:
case 140:
case 141:
case 142:
case 143:
case 144:
case 145:
870 if (!strcmp(floatTypeMemory,
"half")) {
872 sprintf(vecType,
"float16_t");
874 if (!strcmp(floatTypeMemory,
"float")) {
876 sprintf(vecType,
"float");
878 if (!strcmp(floatTypeMemory,
"double")) {
880 sprintf(vecType,
"double");
885layout(std430, binding = %" PRIu64
") buffer DataOut{\n\
886 %s outputs[%" PRIu64
"];\n\
893layout(std430, binding = %" PRIu64
") buffer DataOut{\n\
894 %s outputs[%" PRIu64
"];\n\
909 if (!strcmp(floatTypeMemory,
"half")) {
911 sprintf(vecType,
"f16vec2");
913 if (!strcmp(floatTypeMemory,
"float")) {
915 sprintf(vecType,
"vec2");
917 if (!strcmp(floatTypeMemory,
"double")) {
919 sprintf(vecType,
"dvec2");
923layout(std430, binding = %" PRIu64
") buffer Kernel_FFT{\n\
924 %s kernel_obj[%" PRIu64
"];\n\
931layout(std430, binding = %" PRIu64
") buffer Kernel_FFT{\n\
932 %s kernel_obj[%" PRIu64
"];\n\
937#elif(VKFFT_BACKEND==1)
938 if (!strcmp(floatTypeMemory,
"half")) {
940 sprintf(vecType,
"f16vec2");
942 if (!strcmp(floatTypeMemory,
"float")) {
944 sprintf(vecType,
"float2");
946 if (!strcmp(floatTypeMemory,
"double")) {
948 sprintf(vecType,
"double2");
950#elif(VKFFT_BACKEND==2)
951 if (!strcmp(floatTypeMemory,
"half")) {
953 sprintf(vecType,
"f16vec2");
955 if (!strcmp(floatTypeMemory,
"float")) {
957 sprintf(vecType,
"float2");
959 if (!strcmp(floatTypeMemory,
"double")) {
961 sprintf(vecType,
"double2");
963#elif(VKFFT_BACKEND==3)
964 if (!strcmp(floatTypeMemory,
"half")) {
966 sprintf(vecType,
"f16vec2");
968 if (!strcmp(floatTypeMemory,
"float")) {
970 sprintf(vecType,
"float2");
972 if (!strcmp(floatTypeMemory,
"double")) {
974 sprintf(vecType,
"double2");
983 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
984 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
986layout(std430, binding = %" PRIu64
") readonly buffer DataLUT {\n\
991#elif(VKFFT_BACKEND==1)
992 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
993 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
994#elif(VKFFT_BACKEND==2)
995 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
996 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
997#elif(VKFFT_BACKEND==3)
998 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
999 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1006 uint64_t loc_id = id;
1007#if(VKFFT_BACKEND==0)
1008 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
1009 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
1012layout(std430, binding = %" PRIu64
") readonly buffer DataBluesteinConvolutionKernel {\n\
1013%s BluesteinConvolutionKernel[];\n\
1014};\n", loc_id, vecType);
1021layout(std430, binding = %" PRIu64
") readonly buffer DataBluesteinMultiplication {\n\
1022%s BluesteinMultiplication[];\n\
1023};\n", loc_id, vecType);
1028#elif(VKFFT_BACKEND==1)
1029 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1030 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1031#elif(VKFFT_BACKEND==2)
1032 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1033 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1034#elif(VKFFT_BACKEND==3)
1035 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1036 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1042 switch (inputType) {
1043 case 0:
case 2:
case 3:
case 4:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144: {
1044 char inputOffset[30] =
"";
1047 char shiftX[500] =
"";
1049 sprintf(shiftX,
"(%s)", index_x);
1051 sprintf(shiftX,
"(%s) * %" PRIu64
"", index_x, sc->
inputStride[0]);
1052 char shiftY[500] =
"";
1054 if (sc->
size[1] > 1) {
1076 char shiftZ[500] =
"";
1077 if (sc->
size[2] > 1) {
1091 char shiftCoordinate[500] =
"";
1098 sprintf(shiftCoordinate,
" + %s * %" PRIu64
"", coordinate, sc->
inputStride[3]);
1100 char shiftBatch[500] =
"";
1103 sprintf(shiftBatch,
" + %s * %" PRIu64
"", batchID, sc->
inputStride[4]);
1108 sc->
tempLen = sprintf(sc->
tempStr,
"%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1113 case 1:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145: {
1114 char inputOffset[30] =
"";
1117 char shiftX[500] =
"";
1119 sprintf(shiftX,
"(%s)", index_x);
1121 sprintf(shiftX,
"(%s) * %" PRIu64
"", index_x, sc->
inputStride[0]);
1123 char shiftY[500] =
"";
1125 sprintf(shiftY,
" + (%s) * %" PRIu64
"", index_y, sc->
inputStride[1]);
1127 char shiftZ[500] =
"";
1128 if (sc->
size[2] > 1) {
1142 char shiftCoordinate[500] =
"";
1149 sprintf(shiftCoordinate,
" + %s * %" PRIu64
"", coordinate, sc->
inputStride[3]);
1151 char shiftBatch[500] =
"";
1154 sprintf(shiftBatch,
" + %s * %" PRIu64
"", batchID, sc->
inputStride[4]);
1159 sc->
tempLen = sprintf(sc->
tempStr,
"%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1169 switch (outputType) {
1170 case 0:
case 2:
case 3:
case 4:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144: {
1171 char outputOffset[30] =
"";
1174 char shiftX[500] =
"";
1176 sprintf(shiftX,
"(%s)", index_x);
1178 sprintf(shiftX,
"(%s) * %" PRIu64
"", index_x, sc->
outputStride[0]);
1179 char shiftY[500] =
"";
1181 if (sc->
size[1] > 1) {
1203 char shiftZ[500] =
"";
1204 if (sc->
size[2] > 1) {
1218 char shiftCoordinate[500] =
"";
1225 sprintf(shiftCoordinate,
" + %s * %" PRIu64
"", coordinate, sc->
outputStride[3]);
1227 char shiftBatch[500] =
"";
1230 sprintf(shiftBatch,
" + %s * %" PRIu64
"", batchID, sc->
outputStride[4]);
1235 sc->
tempLen = sprintf(sc->
tempStr,
"%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1240 case 1:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145: {
1241 char outputOffset[30] =
"";
1244 char shiftX[500] =
"";
1246 sprintf(shiftX,
"(%s)", index_x);
1248 sprintf(shiftX,
"(%s) * %" PRIu64
"", index_x, sc->
outputStride[0]);
1249 char shiftY[500] =
"";
1251 sprintf(shiftY,
" + (%s) * %" PRIu64
"", index_y, sc->
outputStride[1]);
1252 char shiftZ[500] =
"";
1253 if (sc->
size[2] > 1) {
1267 char shiftCoordinate[500] =
"";
1274 sprintf(shiftCoordinate,
" + %s * %" PRIu64
"", coordinate, sc->
outputStride[3]);
1276 char shiftBatch[500] =
"";
1279 sprintf(shiftBatch,
" + %s * %" PRIu64
"", batchID, sc->
outputStride[4]);
1284 sc->
tempLen = sprintf(sc->
tempStr,
"%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch);
1297 char LFending[4] =
"";
1298 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
1299#if(VKFFT_BACKEND==0)
1300 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
1301 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
1302 char cosDef[20] =
"cos";
1303 char sinDef[20] =
"sin";
1304 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
1305#elif(VKFFT_BACKEND==1)
1306 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1307 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1308 char cosDef[20] =
"__cosf";
1309 char sinDef[20] =
"__sinf";
1310 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
1311#elif(VKFFT_BACKEND==2)
1312 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1313 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1314 char cosDef[20] =
"__cosf";
1315 char sinDef[20] =
"__sinf";
1316 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
1317#elif(VKFFT_BACKEND==3)
1318 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
1319 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
1320 char cosDef[20] =
"native_cos";
1321 char sinDef[20] =
"native_sin";
1324 char* temp = sc->
temp;
1330 char convolutionInverse[30] =
"";
1331 if (sc->
convolutionStep) sprintf(convolutionInverse,
", %s inverse", uintType);
1357 if (!strcmp(floatType,
"float")) {
1358 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle);\n",
w, cosDef);
1361 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle);\n",
w, sinDef);
1365 if (!strcmp(floatType,
"double")) {
1395 for (uint64_t i = 0; i < 2; i++) {
1396 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
1398 for (uint64_t j = 0; j < i; j++) {
1406 sprintf(tf[0],
"-0.5%s", LFending);
1407 sprintf(tf[1],
"-0.8660254037844386467637231707529%s", LFending);
1427 if (!strcmp(floatType,
"float")) {
1428 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 4.0 / 3.0, LFending);
1431 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 4.0 / 3.0, LFending);
1436 if (!strcmp(floatType,
"double")) {
1437 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 4.0 / 3.0, LFending);
1447 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n",
w, stageSize);
1457 if (!strcmp(floatType,
"float")) {
1458 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 / 3.0, LFending);
1461 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 / 3.0, LFending);
1466 if (!strcmp(floatType,
"double")) {
1467 sc->
tempLen = sprintf(sc->
tempStr,
" %s=sincos_20(angle*%.17f%s);\n",
w, 2.0 / 3.0, LFending);
1523 for (uint64_t i = 0; i < 2; i++) {
1552 if (!strcmp(floatType,
"float")) {
1553 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle);\n",
w, cosDef);
1556 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle);\n",
w, sinDef);
1560 if (!strcmp(floatType,
"double")) {
1589 sc->
tempLen = sprintf(sc->
tempStr,
" %s=twiddleLUT[LUTId+%" PRIu64
"];\n",
w, stageSize);
1599 if (!strcmp(floatType,
"float")) {
1600 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(0.5%s*angle);\n",
w, cosDef, LFending);
1603 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(0.5%s*angle);\n",
w, sinDef, LFending);
1607 if (!strcmp(floatType,
"double")) {
1608 sc->
tempLen = sprintf(sc->
tempStr,
" %s=normalize(%s + %s(1.0, 0.0));\n",
w,
w, vecType);
1624 if (stageAngle < 0) {
1681 for (uint64_t i = 0; i < 5; i++) {
1682 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
1684 for (uint64_t j = 0; j < i; j++) {
1691 sprintf(tf[0],
"-0.5%s", LFending);
1692 sprintf(tf[1],
"1.538841768587626701285145288018455%s", LFending);
1693 sprintf(tf[2],
"-0.363271264002680442947733378740309%s", LFending);
1694 sprintf(tf[3],
"-0.809016994374947424102293417182819%s", LFending);
1695 sprintf(tf[4],
"-0.587785252292473129168705954639073%s", LFending);
1706 for (uint64_t i = radix - 1; i > 0; i--) {
1707 if (i == radix - 1) {
1719 if (!strcmp(floatType,
"float")) {
1720 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
1723 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
1728 if (!strcmp(floatType,
"double")) {
1729 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
1737 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n",
w, (radix - 1 - i) * stageSize);
1747 if (!strcmp(floatType,
"float")) {
1748 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
1751 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
1756 if (!strcmp(floatType,
"double")) {
1757 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
1870 for (uint64_t i = 0; i < 5; i++) {
1887 for (uint64_t i = 0; i < 8; i++) {
1888 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
1890 for (uint64_t j = 0; j < i; j++) {
1897 sprintf(tf[0],
"-1.16666666666666651863693004997913%s", LFending);
1898 sprintf(tf[1],
"0.79015646852540022404554065360571%s", LFending);
1899 sprintf(tf[2],
"0.05585426728964774240049351305970%s", LFending);
1900 sprintf(tf[3],
"0.73430220123575240531721419756650%s", LFending);
1901 if (stageAngle < 0) {
1902 sprintf(tf[4],
"0.44095855184409837868031445395900%s", LFending);
1903 sprintf(tf[5],
"0.34087293062393136944265847887436%s", LFending);
1904 sprintf(tf[6],
"-0.53396936033772524066165487965918%s", LFending);
1905 sprintf(tf[7],
"0.87484229096165666561546458979137%s", LFending);
1908 sprintf(tf[4],
"-0.44095855184409837868031445395900%s", LFending);
1909 sprintf(tf[5],
"-0.34087293062393136944265847887436%s", LFending);
1910 sprintf(tf[6],
"0.53396936033772524066165487965918%s", LFending);
1911 sprintf(tf[7],
"-0.87484229096165666561546458979137%s", LFending);
1920 for (uint64_t i = radix - 1; i > 0; i--) {
1921 if (i == radix - 1) {
1933 if (!strcmp(floatType,
"float")) {
1934 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
1937 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
1942 if (!strcmp(floatType,
"double")) {
1943 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
1951 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n\n",
w, (radix - 1 - i) * stageSize);
1961 if (!strcmp(floatType,
"float")) {
1962 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
1965 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
1970 if (!strcmp(floatType,
"double")) {
1971 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
2141 for (uint64_t i = 0; i < 8; i++) {
2171 if (!strcmp(floatType,
"float")) {
2172 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle);\n",
w, cosDef);
2175 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle);\n",
w, sinDef);
2179 if (!strcmp(floatType,
"double")) {
2185 for (uint64_t i = 0; i < 4; i++) {
2199 sc->
tempLen = sprintf(sc->
tempStr,
" %s=twiddleLUT[LUTId+%" PRIu64
"];\n\n",
w, stageSize);
2209 if (!strcmp(floatType,
"float")) {
2210 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(0.5%s*angle);\n",
w, cosDef, LFending);
2213 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(0.5%s*angle);\n",
w, sinDef, LFending);
2217 if (!strcmp(floatType,
"double")) {
2218 sc->
tempLen = sprintf(sc->
tempStr,
" %s=normalize(%s + %s(1.0, 0.0));\n",
w,
w, vecType);
2223 for (uint64_t i = 0; i < 2; i++) {
2236 if (stageAngle < 0) {
2255 for (uint64_t i = 4; i < 6; i++) {
2270 sc->
tempLen = sprintf(sc->
tempStr,
" %s=twiddleLUT[LUTId+%" PRIu64
"];\n\n",
w, 2 * stageSize);
2280 if (!strcmp(floatType,
"float")) {
2281 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(0.25%s*angle);\n",
w, cosDef, LFending);
2284 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(0.25%s*angle);\n",
w, sinDef, LFending);
2289 if (!strcmp(floatType,
"double")) {
2290 sc->
tempLen = sprintf(sc->
tempStr,
" %s=normalize(%s + %s(1.0, 0.0));\n",
w,
w, vecType);
2306 if (stageAngle < 0) {
2335 if (stageAngle < 0) {
2336 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw,
w,
w);
2339 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw,
w,
w);
2344 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw,
w,
w);
2347 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw,
w,
w);
2362 if (stageAngle < 0) {
2420 for (uint64_t i = 0; i < 20; i++) {
2421 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
2423 for (uint64_t j = 0; j < i; j++) {
2432 sprintf(tf[0],
"-1.100000000000000%s", LFending);
2434 sprintf(tf[2],
"0.253097611605959%s", LFending);
2435 sprintf(tf[3],
"-1.288200610773679%s", LFending);
2436 sprintf(tf[4],
"0.304632239669212%s", LFending);
2437 sprintf(tf[5],
"-0.391339615511917%s", LFending);
2438 sprintf(tf[6],
"-2.871022253392850%s", LFending);
2439 sprintf(tf[7],
"1.374907986616384%s", LFending);
2440 sprintf(tf[8],
"0.817178135341212%s", LFending);
2441 sprintf(tf[9],
"1.800746506445679%s", LFending);
2442 sprintf(tf[10],
"-0.859492973614498%s", LFending);
2444 if (stageAngle < 0) {
2445 sprintf(tf[1],
"0.331662479035540%s", LFending);
2446 sprintf(tf[11],
"-2.373470454748280%s", LFending);
2447 sprintf(tf[12],
"-0.024836393087493%s", LFending);
2448 sprintf(tf[13],
"0.474017017512829%s", LFending);
2449 sprintf(tf[14],
"0.742183927770612%s", LFending);
2450 sprintf(tf[15],
"1.406473309094609%s", LFending);
2451 sprintf(tf[16],
"-1.191364552195948%s", LFending);
2452 sprintf(tf[17],
"0.708088885039503%s", LFending);
2453 sprintf(tf[18],
"0.258908260614168%s", LFending);
2454 sprintf(tf[19],
"-0.049929922194110%s", LFending);
2457 sprintf(tf[1],
"-0.331662479035540%s", LFending);
2458 sprintf(tf[11],
"2.373470454748280%s", LFending);
2459 sprintf(tf[12],
"0.024836393087493%s", LFending);
2460 sprintf(tf[13],
"-0.474017017512829%s", LFending);
2461 sprintf(tf[14],
"-0.742183927770612%s", LFending);
2462 sprintf(tf[15],
"-1.406473309094609%s", LFending);
2463 sprintf(tf[16],
"1.191364552195948%s", LFending);
2464 sprintf(tf[17],
"-0.708088885039503%s", LFending);
2465 sprintf(tf[18],
"-0.258908260614168%s", LFending);
2466 sprintf(tf[19],
"0.049929922194110%s", LFending);
2468 for (uint64_t i = radix - 1; i > 0; i--) {
2469 if (i == radix - 1) {
2481 if (!strcmp(floatType,
"float")) {
2482 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
2485 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
2490 if (!strcmp(floatType,
"double")) {
2491 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
2499 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n\n",
w, (radix - 1 - i) * stageSize);
2509 if (!strcmp(floatType,
"float")) {
2510 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
2513 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
2518 if (!strcmp(floatType,
"double")) {
2519 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
2530 uint64_t permute[11] = { 0,1,9,4,3,5,10,2,7,8,6 };
2533 for (uint64_t i = 0; i < 5; i++) {
2541 for (uint64_t i = 0; i < 4; i++) {
2549 for (uint64_t i = 0; i < 4; i++) {
2562 for (uint64_t k = 0; k < 2; k++) {
2630 res =
VkAddComplex(sc, regID[k * 4 + 3], sc->
locID[k * 4 + 3], regID[k * 4 + 5]);
2632 res =
VkAddComplex(sc, regID[k * 4 + 4], sc->
locID[k * 4 + 4], regID[k * 4 + 5]);
2635 res =
VkAddComplex(sc, regID[k * 4 + 5], sc->
locID[k * 4 + 5], regID[k * 4 + 6]);
2637 res =
VkAddComplex(sc, regID[k * 4 + 6], sc->
locID[k * 4 + 6], regID[k * 4 + 6]);
2646 for (uint64_t i = 0; i < 4; i++) {
2654 for (uint64_t i = 0; i < 4; i++) {
2660 for (uint64_t i = 0; i < 5; i++) {
2666 uint64_t permute2[11] = { 0,10,1,8,7,9,4,2,3,6,5 };
2667 res =
VkPermute(sc, permute2, 11, 1, regID);
2670 for (uint64_t i = 0; i < 20; i++) {
2682 for (uint64_t i = 0; i < 20; i++) {
2683 tf[i] = (
char*)malloc(
sizeof(
char) * 50);
2685 for (uint64_t j = 0; j < i; j++) {
// Fill the tf[] strings with decimal constants, each followed by the LFending
// string. NOTE(review): presumably these are the radix-13 butterfly constants
// (the permutation tables that follow have 13 entries) - confirm.
// tf[0], tf[1] and tf[5]..tf[10] are written unconditionally.
2694 sprintf(tf[0],
"-1.083333333333333%s", LFending);
2695 sprintf(tf[1],
"-0.300462606288666%s", LFending);
2696 sprintf(tf[5],
"1.007074065727533%s", LFending);
2697 sprintf(tf[6],
"0.731245990975348%s", LFending);
2698 sprintf(tf[7],
"-0.579440018900960%s", LFending);
2699 sprintf(tf[8],
"0.531932498429674%s", LFending);
2700 sprintf(tf[9],
"-0.508814921720398%s", LFending);
2701 sprintf(tf[10],
"-0.007705858903092%s", LFending);
// tf[2]..tf[4] and tf[11]..tf[19] depend on the sign of stageAngle; this
// group is written when stageAngle is negative.
2703 if (stageAngle < 0) {
2704 sprintf(tf[2],
"-0.749279330626139%s", LFending);
2705 sprintf(tf[3],
"0.401002128321867%s", LFending);
2706 sprintf(tf[4],
"0.174138601152136%s", LFending);
2707 sprintf(tf[11],
"-2.511393318389568%s", LFending);
2708 sprintf(tf[12],
"-1.823546408682421%s", LFending);
2709 sprintf(tf[13],
"1.444979909023996%s", LFending);
2710 sprintf(tf[14],
"-1.344056915177370%s", LFending);
2711 sprintf(tf[15],
"-0.975932420775946%s", LFending);
2712 sprintf(tf[16],
"0.773329778651105%s", LFending);
2713 sprintf(tf[17],
"1.927725116783469%s", LFending);
2714 sprintf(tf[18],
"1.399739414729183%s", LFending);
2715 sprintf(tf[19],
"-1.109154843837551%s", LFending);
// Same entries as the group above with every sign flipped.
// NOTE(review): the line separating the two groups is not visible in this
// excerpt - presumably this is the else (stageAngle >= 0) branch; confirm.
2718 sprintf(tf[2],
"0.749279330626139%s", LFending);
2719 sprintf(tf[3],
"-0.401002128321867%s", LFending);
2720 sprintf(tf[4],
"-0.174138601152136%s", LFending);
2721 sprintf(tf[11],
"2.511393318389568%s", LFending);
2722 sprintf(tf[12],
"1.823546408682421%s", LFending);
2723 sprintf(tf[13],
"-1.444979909023996%s", LFending);
2724 sprintf(tf[14],
"1.344056915177370%s", LFending);
2725 sprintf(tf[15],
"0.975932420775946%s", LFending);
2726 sprintf(tf[16],
"-0.773329778651105%s", LFending);
2727 sprintf(tf[17],
"-1.927725116783469%s", LFending);
2728 sprintf(tf[18],
"-1.399739414729183%s", LFending);
2729 sprintf(tf[19],
"1.109154843837551%s", LFending);
2731 for (uint64_t i = radix - 1; i > 0; i--) {
2732 if (i == radix - 1) {
2744 if (!strcmp(floatType,
"float")) {
2745 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
2748 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
2753 if (!strcmp(floatType,
"double")) {
2754 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
2762 sc->
tempLen = sprintf(sc->
tempStr,
" %s = twiddleLUT[LUTId+%" PRIu64
"];\n\n",
w, (radix - 1 - i) * stageSize);
2772 if (!strcmp(floatType,
"float")) {
2773 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s(angle*%.17f%s);\n",
w, cosDef, 2.0 * i / radix, LFending);
2776 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s(angle*%.17f%s);\n",
w, sinDef, 2.0 * i / radix, LFending);
2781 if (!strcmp(floatType,
"double")) {
2782 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sincos_20(angle*%.17f%s);\n",
w, 2.0 * i / radix, LFending);
2794 uint64_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 };
2797 for (uint64_t i = 0; i < 6; i++) {
2803 for (uint64_t i = 0; i < 3; i++) {
2809 for (uint64_t i = 0; i < 4; i++) {
2812 res =
VkSubComplex(sc, sc->
locID[i * 2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]);
2816 res =
VkSubComplex(sc, sc->
locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]);
2826 for (uint64_t k = 0; k < 3; k++) {
2846 res =
VkAddComplex(sc, regID[k * 2 + 3], sc->
locID[k * 2 + 3], regID[k * 2 + 4]);
2848 res =
VkAddComplex(sc, regID[k * 2 + 4], sc->
locID[k * 2 + 4], regID[k * 2 + 4]);
2907 for (uint64_t i = 0; i < 4; i++) {
2917 for (uint64_t i = 0; i < 3; i++) {
2925 for (uint64_t i = 0; i < 6; i++) {
2931 uint64_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6 };
2932 res =
VkPermute(sc, permute2, 13, 1, regID);
2935 for (uint64_t i = 0; i < 20; i++) {
// Pick the two strings that the sdata declaration emitted further down is
// built from: vecType (two-component vector type name) and sharedDefinitions
// (storage qualifier placed in front of it). Both depend on floatType and on
// the compile-time VKFFT_BACKEND value.
2947 char sharedDefinitions[20] =
"";
2948 uint64_t vecSize = 1;
2949 uint64_t maxSequenceSharedMemory = 0;
// Single precision: "vec2" + "shared" for backend 0, "float2" + "__shared__"
// for backends 1 and 2, "float2" + "__local" for backend 3.
2951 if (!strcmp(floatType,
"float"))
2953#if(VKFFT_BACKEND==0)
2954 sprintf(vecType,
"vec2");
2955 sprintf(sharedDefinitions,
"shared");
2956#elif(VKFFT_BACKEND==1)
2957 sprintf(vecType,
"float2");
2958 sprintf(sharedDefinitions,
"__shared__");
2959#elif(VKFFT_BACKEND==2)
2960 sprintf(vecType,
"float2");
2961 sprintf(sharedDefinitions,
"__shared__");
2962#elif(VKFFT_BACKEND==3)
2963 sprintf(vecType,
"float2");
2964 sprintf(sharedDefinitions,
"__local");
// Double precision: same qualifiers, with "dvec2" / "double2" as the type.
2968 if (!strcmp(floatType,
"double")) {
2969#if(VKFFT_BACKEND==0)
2970 sprintf(vecType,
"dvec2");
2971 sprintf(sharedDefinitions,
"shared");
2972#elif(VKFFT_BACKEND==1)
2973 sprintf(vecType,
"double2");
2974 sprintf(sharedDefinitions,
"__shared__");
2975#elif(VKFFT_BACKEND==2)
2976 sprintf(vecType,
"double2");
2977 sprintf(sharedDefinitions,
"__shared__");
2978#elif(VKFFT_BACKEND==3)
2979 sprintf(vecType,
"double2");
2980 sprintf(sharedDefinitions,
"__local");
2987 switch (sharedType) {
2988 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144:
3005#if(VKFFT_BACKEND==0)
3006 sc->
tempLen = sprintf(sc->
tempStr,
"%s %s sdata[%" PRIu64
"];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->
localSize[1] * sc->
maxSharedStride);
3009#elif(VKFFT_BACKEND==1)
3011 sc->
tempLen = sprintf(sc->
tempStr,
"%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3015#elif(VKFFT_BACKEND==2)
3017 sc->
tempLen = sprintf(sc->
tempStr,
"%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3021#elif(VKFFT_BACKEND==3)
3022 sc->
tempLen = sprintf(sc->
tempStr,
"%s %s sdata[%" PRIu64
"];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->
localSize[1] * sc->
maxSharedStride);
3029 case 1:
case 2:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145:
3038#if(VKFFT_BACKEND==0)
3042#elif(VKFFT_BACKEND==1)
3044 sc->
tempLen = sprintf(sc->
tempStr,
"%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3048#elif(VKFFT_BACKEND==2)
3050 sc->
tempLen = sprintf(sc->
tempStr,
"%s* sdata = (%s*)shared;\n\n", vecType, vecType);
3054#elif(VKFFT_BACKEND==3)
3068#if(VKFFT_BACKEND==0)
3069 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
3070 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
3071#elif(VKFFT_BACKEND==1)
3072 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3073 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3074#elif(VKFFT_BACKEND==2)
3075 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3076 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3077#elif(VKFFT_BACKEND==3)
3078 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3079 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3086 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_%" PRIu64
";\n", vecType, i);
3089 sc->
tempLen = sprintf(sc->
tempStr,
" temp_%" PRIu64
".x=0;\n", i);
3092 sc->
tempLen = sprintf(sc->
tempStr,
" temp_%" PRIu64
".y=0;\n", i);
3098 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_%" PRIu64
"_%" PRIu64
";\n", vecType, i, j);
3101 sc->
tempLen = sprintf(sc->
tempStr,
" temp_%" PRIu64
"_%" PRIu64
".x=0;\n", i, j);
3104 sc->
tempLen = sprintf(sc->
tempStr,
" temp_%" PRIu64
"_%" PRIu64
".y=0;\n", i, j);
3112 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_%" PRIu64
";\n", vecType, i);
3115 sc->
tempLen = sprintf(sc->
tempStr,
" temp_%" PRIu64
".x=0;\n", i);
3118 sc->
tempLen = sprintf(sc->
tempStr,
" temp_%" PRIu64
".y=0;\n", i);
3126 sc->
regIDs = (
char**)malloc(
sizeof(
char*) * logicalStoragePerThread);
3128 for (uint64_t i = 0; i < logicalStoragePerThread; i++) {
3129 sc->
regIDs[i] = (
char*)malloc(
sizeof(
char) * 50);
3131 for (uint64_t j = 0; j < i; j++) {
3139 if (i < logicalRegistersPerThread)
3140 sprintf(sc->
regIDs[i],
"temp_%" PRIu64
"", i);
3142 sprintf(sc->
regIDs[i],
"temp_%" PRIu64
"", i);
3177 sprintf(sc->
w,
"w");
// maxNonPow2Radix ends up as the largest of 3, 5, 7, 11, 13 that divides
// sc->fftDim, or stays 1 when none of them does. The checks run in ascending
// order and are not chained with else, so a later (larger) divisor overwrites
// an earlier match. The value is used just below as the number of "loc_<i>"
// names written into sc->locID.
3178 uint64_t maxNonPow2Radix = 1;
3179 if (sc->
fftDim % 3 == 0) maxNonPow2Radix = 3;
3180 if (sc->
fftDim % 5 == 0) maxNonPow2Radix = 5;
3181 if (sc->
fftDim % 7 == 0) maxNonPow2Radix = 7;
3182 if (sc->
fftDim % 11 == 0) maxNonPow2Radix = 11;
3183 if (sc->
fftDim % 13 == 0) maxNonPow2Radix = 13;
3184 for (uint64_t i = 0; i < maxNonPow2Radix; i++) {
3185 sprintf(sc->
locID[i],
"loc_%" PRIu64
"", i);
3197 uint64_t useRadix8 = 0;
3198 for (uint64_t i = 0; i < sc->
numStages; i++)
3200 if (useRadix8 == 1) {
3201 if (maxNonPow2Radix > 1) sprintf(sc->
iw,
"%s", sc->
locID[1]);
3212 sprintf(sc->
iw,
"iw");
3254 %s tshuffle= ((%s>>1))%%(%" PRIu64
");\n\
3261 sc->
tempLen = sprintf(sc->
tempStr,
" shuffle[%" PRIu64
"].x = 0;\n", i);
3264 sc->
tempLen = sprintf(sc->
tempStr,
" shuffle[%" PRIu64
"].y = 0;\n", i);
3753 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144:
3763 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
3766 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
3771 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
3774 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
3780 case 1:
case 2:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145:
3840 case 110:
case 111:
case 120:
case 121:
case 130:
case 131:
case 140:
case 141:
case 142:
case 143:
// Local setup of the strings spliced into the generated source text:
//   inputsStruct - name used to reference the input buffer,
//   LFending     - string appended after numeric literals,
//   vecType      - two-component vector type name,
//   cosDef/sinDef - names of the cosine / sine functions to emit.
// Everything except the float LFending is selected per VKFFT_BACKEND. Per the
// include block at the top of the file, backend 1 pulls in the CUDA runtime
// headers, 2 the HIP headers and 3 the OpenCL header; backend 0 appears to be
// the Vulkan/GLSL path - confirm.
3855 double double_PI = 3.1415926535897932384626433832795;
3857 char inputsStruct[20] =
"";
3858 char LFending[4] =
"";
// Single precision uses the "f" literal suffix on every backend.
3859 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
3860#if(VKFFT_BACKEND==0)
// NOTE(review): the condition choosing between "inputs" and ".inputs" is not
// visible in this excerpt; as shown, the second call overwrites the first.
3862 sprintf(inputsStruct,
"inputs");
3864 sprintf(inputsStruct,
".inputs");
3865 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
3866 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
// Double literals get the "LF" suffix on this backend.
3867 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
3868 char cosDef[20] =
"cos";
3869 char sinDef[20] =
"sin";
3870#elif(VKFFT_BACKEND==1)
3871 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3872 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
// Double literals get the "l" suffix on backends 1 and 2.
3873 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
3874 sprintf(inputsStruct,
"inputs");
3875 char cosDef[20] =
"__cosf";
3876 char sinDef[20] =
"__sinf";
3877#elif(VKFFT_BACKEND==2)
3878 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3879 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3880 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
3881 sprintf(inputsStruct,
"inputs");
3882 char cosDef[20] =
"__cosf";
3883 char sinDef[20] =
"__sinf";
3884#elif(VKFFT_BACKEND==3)
// No LFending assignment for double is visible on this backend, so it keeps
// its initial empty string there.
3885 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
3886 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
3887 sprintf(inputsStruct,
"inputs");
3888 char cosDef[20] =
"native_cos";
3889 char sinDef[20] =
"native_sin";
3891 char convTypeLeft[20] =
"";
3892 char convTypeRight[20] =
"";
3893 if ((!strcmp(floatType,
"float")) && (strcmp(floatTypeMemory,
"float"))) {
3894 if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
3895#if(VKFFT_BACKEND==0)
3896 sprintf(convTypeLeft,
"float(");
3897 sprintf(convTypeRight,
")");
3898#elif(VKFFT_BACKEND==1)
3899 sprintf(convTypeLeft,
"(float)");
3901#elif(VKFFT_BACKEND==2)
3902 sprintf(convTypeLeft,
"(float)");
3904#elif(VKFFT_BACKEND==3)
3905 sprintf(convTypeLeft,
"(float)");
3910#if(VKFFT_BACKEND==0)
3911 sprintf(convTypeLeft,
"vec2(");
3912 sprintf(convTypeRight,
")");
3913#elif(VKFFT_BACKEND==1)
3914 sprintf(convTypeLeft,
"conv_float2(");
3915 sprintf(convTypeRight,
")");
3916#elif(VKFFT_BACKEND==2)
3917 sprintf(convTypeLeft,
"conv_float2(");
3918 sprintf(convTypeRight,
")");
3919#elif(VKFFT_BACKEND==3)
3920 sprintf(convTypeLeft,
"conv_float2(");
3921 sprintf(convTypeRight,
")");
3925 if ((!strcmp(floatType,
"double")) && (strcmp(floatTypeMemory,
"double"))) {
3926 if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) {
3927#if(VKFFT_BACKEND==0)
3928 sprintf(convTypeLeft,
"double(");
3929 sprintf(convTypeRight,
")");
3930#elif(VKFFT_BACKEND==1)
3931 sprintf(convTypeLeft,
"(double)");
3933#elif(VKFFT_BACKEND==2)
3934 sprintf(convTypeLeft,
"(double)");
3936#elif(VKFFT_BACKEND==3)
3937 sprintf(convTypeLeft,
"(double)");
3942#if(VKFFT_BACKEND==0)
3943 sprintf(convTypeLeft,
"dvec2(");
3944 sprintf(convTypeRight,
")");
3945#elif(VKFFT_BACKEND==1)
3946 sprintf(convTypeLeft,
"conv_double2(");
3947 sprintf(convTypeRight,
")");
3948#elif(VKFFT_BACKEND==2)
3949 sprintf(convTypeLeft,
"conv_double2(");
3950 sprintf(convTypeRight,
")");
3951#elif(VKFFT_BACKEND==3)
3952 sprintf(convTypeLeft,
"conv_double2(");
3953 sprintf(convTypeRight,
")");
3957 char index_x[2000] =
"";
3958 char index_y[2000] =
"";
3959 char requestCoordinate[100] =
"";
3962 sprintf(requestCoordinate,
"coordinate");
3965 char requestBatch[100] =
"";
3968 sprintf(requestBatch,
"0");
3976 char shiftX[500] =
"";
3978 sprintf(shiftX,
" + consts.workGroupShiftX ");
3979 char shiftY[500] =
"";
3988 char shiftY2[100] =
"";
3990 sprintf(shiftY,
" + consts.workGroupShiftY ");
3998 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
4005 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
4076 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4078 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4082 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4084 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4102 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4105 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4110 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4113 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4133 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4136 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4141 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4144 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4189 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");\n", sc->
fftDim, sc->
fftDim, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize);
4194 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %s+%" PRIu64
"+%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");\n", sc->
gl_LocalInvocationID_x, (i + k * sc->
min_registers_per_thread) * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
4225 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID / %" PRIu64
") + sharedStride*(combinedID %% %" PRIu64
")] = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4253 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID / %" PRIu64
") + sharedStride*(combinedID %% %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4256 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID / %" PRIu64
") + sharedStride*(combinedID %% %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4283 char shiftX[500] =
"";
4287 sprintf(sc->
disableThreadsStart,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
4293 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
"));\n", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
4329 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[%s*(%s+%" PRIu64
")+%s]=%sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4386 char shiftX[500] =
"";
4397 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s%s) %% (%" PRIu64
") + %" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
4432 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[%s*(%s+%" PRIu64
")+%s]=%sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4489 char shiftX[500] =
"";
4491 sprintf(shiftX,
" + consts.workGroupShiftX ");
4492 char shiftY[500] =
"";
4494 sprintf(shiftY,
" + consts.workGroupShiftY ");
4514 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
4521 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
4558 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %sinputBlocks[(%s + %" PRIu64
")/ %" PRIu64
"]%s[(%s + %" PRIu64
") %% %" PRIu64
"]%s;\n", sc->
regIDs[i + k * sc->
registers_per_thread], convTypeLeft, sc->
inoutID, sc->
inputStride[1], sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputStride[1], sc->
inputBufferBlockSize, convTypeRight);
4575 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = %s%s[%s]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4577 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4586 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4594 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride+ (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4596 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4603 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4613 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = %s%s[inoutID]%s;\n", sc->
fftDim, sc->
fftDim, convTypeLeft, inputsStruct, convTypeRight);
4621 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4623 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4643 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4646 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4651 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4654 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4676 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
4679 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
4684 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
4687 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
4699 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
4706 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
4722 char shiftX[500] =
"";
4724 sprintf(shiftX,
" + consts.workGroupShiftX ");
4725 char shiftY[500] =
"";
4728 char shiftY2[100] =
"";
4730 sprintf(shiftY,
" + consts.workGroupShiftY ");
4733 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
4735 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
4753 for (uint64_t i = 0; i < num_in; i++) {
4764 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * %" PRIu64
";\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, sc->
inputStride[1]);
4817 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %s%s[%s]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4819 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4825 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %s%s[%s]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
4827 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %sinputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"]%s;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1), convTypeLeft, sc->
inoutID, sc->
inputBufferBlockSize, inputsStruct, sc->
inoutID, sc->
inputBufferBlockSize, convTypeRight);
4846 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4849 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4854 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4857 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", mult * (sc->
fftDim / 2 + 1), mult * (sc->
fftDim / 2 + 1));
4910 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1])) {
4982 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[0])) {
5056 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[1])) {
5128 if (i >= (uint64_t)ceil((sc->
fftDim / 2 + 1) / (
double)sc->
localSize[0])) {
5207 char shiftX[500] =
"";
5209 sprintf(shiftX,
" + consts.workGroupShiftX ");
5210 char shiftY[500] =
"";
5212 sprintf(shiftY,
" + consts.workGroupShiftY ");
5225 for (uint64_t i = 0; i < num_in; i++) {
5240 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5257 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5289 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
5294 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
5306 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5320 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim-1);
5323 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", 2*sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5326 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5334 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
5338 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5348 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5362 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5365 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5368 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5388 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
5393 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
5405 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5408 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5411 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5419 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5422 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim - 2, sc->
fftDim, sc->
fftDim);
5425 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5442 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5454 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5472 char shiftX[500] =
"";
5474 sprintf(shiftX,
" + consts.workGroupShiftX ");
5475 char shiftX2[500] =
"";
5478 char shiftY[500] =
"";
5480 sprintf(shiftY,
" + consts.workGroupShiftY ");
5491 uint64_t num_in = (uint64_t)ceil((sc->
fftDim) / (double)sc->
localSize[1]);
5493 for (uint64_t i = 0; i < num_in; i++) {
5499 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
5521 sc->
tempLen = sprintf(sc->
tempStr,
" //sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (%s + ((%s + %" PRIu64
") %% %" PRIu64
") * %" PRIu64
") / %" PRIu64
";\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
gl_LocalInvocationID_x, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], mult, sc->
localSize[0], mult);
5539 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
5552 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
5583 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5589 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5627 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")>0)&&((combinedID %% %" PRIu64
") < %" PRIu64
")){\n", sc->
fftDim, sc->
fftDim, sc->
fftDim - 1);
5633 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[inoutID] = sdata[sdataID];\n");
5653 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
5670 char shiftX[500] =
"";
5672 sprintf(shiftX,
" + consts.workGroupShiftX ");
5673 char shiftY[500] =
"";
5675 sprintf(shiftY,
" + consts.workGroupShiftY ");
5701 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5713 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5740 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5745 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
5757 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5773 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5777 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5787 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
5815 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5820 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
5836 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
5843 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
5860 char shiftX[500] =
"";
5862 sprintf(shiftX,
" + consts.workGroupShiftX ");
5863 char shiftX2[500] =
"";
5866 char shiftY[500] =
"";
5868 sprintf(shiftY,
" + consts.workGroupShiftY ");
5885 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
5902 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (%s + ((%s + %" PRIu64
") %% %" PRIu64
") * %" PRIu64
") / %" PRIu64
";\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
gl_LocalInvocationID_x, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], mult, sc->
localSize[0], mult);
5920 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
5933 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6005 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
6021 char shiftX[500] =
"";
6023 sprintf(shiftX,
" + consts.workGroupShiftX ");
6024 char shiftY[500] =
"";
6026 sprintf(shiftY,
" + consts.workGroupShiftY ");
6038 for (uint64_t i = 0; i < num_in; i++) {
6095 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
6098 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
6108 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6120 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s%s[inoutID]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, convTypeRight);
6147 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6152 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6156 sc->
tempLen = sprintf(sc->
tempStr,
" if (combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
6177 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6189 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = %s%s[inoutID]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, convTypeRight);
6221 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6226 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
6295 char shiftX[500] =
"";
6297 sprintf(shiftX,
" + consts.workGroupShiftX ");
6298 char shiftX2[500] =
"";
6301 char shiftY[500] =
"";
6303 sprintf(shiftY,
" + consts.workGroupShiftY ");
6305 uint64_t num_in = (uint64_t)ceil((sc->
fftDim / 2 + 1) / (double)sc->
localSize[1]);
6316 for (uint64_t i = 0; i < num_in; i++) {
6322 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
6353 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
6367 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6398 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID) );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending);
6401 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID) );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending);
6432 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
6443 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[1], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6502 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
6524 char shiftX[500] =
"";
6526 sprintf(shiftX,
" + consts.workGroupShiftX ");
6527 char shiftY[500] =
"";
6536 char shiftY2[100] =
"";
6538 sprintf(shiftY,
" + consts.workGroupShiftY ");
6544 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_x, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize, sc->
fft_dim_full);
6551 sprintf(sc->
disableThreadsStart,
" if(%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
") < %" PRIu64
") {\n", sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize, sc->
fft_dim_full);
6621 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6630 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[2*(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6633 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(2*(combinedID %% %" PRIu64
")+1) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6636 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6639 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6645 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2 - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6648 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2 - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6651 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6654 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) * sharedStride + (combinedID / %" PRIu64
")] = %s;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6659 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[2*(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6662 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(2*(combinedID %% %" PRIu64
")+1) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6665 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6668 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6674 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2 - 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6677 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" - 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2 - 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6680 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[1]);
6683 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(%" PRIu64
" + 2*(combinedID %% %" PRIu64
")) + (combinedID / %" PRIu64
") * sharedStride] = %s;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 8, sc->
fftDim / 8, sc->
regIDs[0]);
6700 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x = 0;\n", sc->
fftDim, sc->
fftDim);
6703 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y = 0;\n", sc->
fftDim, sc->
fftDim);
6708 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x = 0;\n", sc->
fftDim, sc->
fftDim);
6711 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y = 0;\n", sc->
fftDim, sc->
fftDim);
6845 char shiftX[500] =
"";
6849 sprintf(sc->
disableThreadsStart,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
6873 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
"));\n", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
6896 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
6967 char shiftX[500] =
"";
6969 sprintf(shiftX,
" + consts.workGroupShiftX ");
6970 char shiftY[500] =
"";
6972 sprintf(shiftY,
" + consts.workGroupShiftY ");
6981 uint64_t maxBluesteinCutOff = 1;
7019 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7038#if(VKFFT_BACKEND!=3)
7040 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
7046 if (i < sc->min_registers_per_thread) {
7063#if(VKFFT_BACKEND!=3)
7066 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7070 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7075 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7103 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7107 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7111 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7120 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7154#if(VKFFT_BACKEND==3)
7186 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7207 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7211 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7215 if (i < sc->min_registers_per_thread) {
7216 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7227 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7246 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7250 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7254 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7263 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7330 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7351 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7355 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7359 if (i < sc->min_registers_per_thread) {
7360 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7371 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7390 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) * sharedStride + (combinedID / %" PRIu64
");\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7394 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID %% %" PRIu64
")/2) + (combinedID / %" PRIu64
") * sharedStride;\n", 2 * sc->
fftDim, 2 * sc->
fftDim);
7398 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 0) {\n", 2 * sc->
fftDim);
7407 sc->
tempLen = sprintf(sc->
tempStr,
" if (((combinedID %% %" PRIu64
")%%2) == 1) {\n", 2 * sc->
fftDim);
7463 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7468 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
7471 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
7479 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = sdata[sdataID-sharedStride].y;\n", sc->
w);
7482 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = sdata[sdataID-1].y;\n", sc->
w);
7486 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = sdata[sdataID].x;\n", sc->
w);
7502 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim - 1, sc->
fftDim);
7505 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim - 1, sc->
fftDim);
7542 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7547 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
7550 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
7560#if(VKFFT_BACKEND!=3)
7562 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7565 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7596#if(VKFFT_BACKEND==3)
7609 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
7617 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7620 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim, sc->
fftDim);
7650 for (uint64_t i = 0; i < num_in; i++) {
7678 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
7681 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID %% %" PRIu64
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
7687 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7692 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7700 sc->
tempLen = sprintf(sc->
tempStr,
" if (combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
7705 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7710 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
7736 sc->
tempLen = sprintf(sc->
tempStr,
" if (combinedID %% %" PRIu64
" == 0){\n", sc->
fftDim / 2 + 1);
7775 char shiftX[500] =
"";
7777 sprintf(shiftX,
" + consts.workGroupShiftX ");
7778 char shiftX2[500] =
"";
7781 char shiftY[500] =
"";
7783 sprintf(shiftY,
" + consts.workGroupShiftY ");
7816 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
7829#if(VKFFT_BACKEND!=3)
7831 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = %s%s[%s]%s;\n", sc->
regIDs[0], convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
7837 if (i < sc->min_registers_per_thread) {
7854#if(VKFFT_BACKEND!=3)
7855 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
7882 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
7916#if(VKFFT_BACKEND==3)
7941 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
7954 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
7958 if (i < sc->min_registers_per_thread) {
7987 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8047 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
8060 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8064 if (i < sc->min_registers_per_thread) {
8093 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = ((combinedID / %" PRIu64
")/2) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8146 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8153 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = sdata[sdataID-sharedStride].y;\n", sc->
w);
8157 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = sdata[sdataID].x;\n", sc->
w);
8172 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim - 1, sc->
localSize[0]);
8212 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8222#if(VKFFT_BACKEND!=3)
8223 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
8251#if(VKFFT_BACKEND==3)
8271 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (%" PRIu64
" - combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
8295 uint64_t num_in = (uint64_t)ceil((sc->
fftDim / 2 + 1) / (double)sc->
localSize[1]);
8298 for (uint64_t i = 0; i < num_in; i++) {
8318 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (combinedID / %" PRIu64
") );\n", cosDef, double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
8321 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (combinedID / %" PRIu64
") );\n", sinDef, double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
8326 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
localSize[0]);
8338 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%" PRIu64
" - combinedID / %" PRIu64
") * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
8391 char shiftX[500] =
"";
8393 sprintf(shiftX,
" + consts.workGroupShiftX ");
8394 char shiftY[500] =
"";
8396 sprintf(shiftY,
" + consts.workGroupShiftY ");
8422 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
8434 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
8461 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
8466 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
8478 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8494 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim);
8498 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8508 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight);
8536 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
8541 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride ;\n", sc->
fftDim, sc->
fftDim);
8557 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
8564 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
8596 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %" PRIu64
" + 4 * (combinedID %% %" PRIu64
");\n", sc->
fftDim / 2, sc->
fftDim);
8600 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID < %" PRIu64
") sdataID = inoutID;\n", sc->
fftDim);
8603 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
8606 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = inoutID - %" PRIu64
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
8609 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
8612 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID >= %" PRIu64
") sdataID = inoutID - %" PRIu64
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
8621 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8626 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8655 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %" PRIu64
" + 4 * combinedID;\n", sc->
fftDim / 2);
8659 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID < %" PRIu64
") sdataID = inoutID;\n", sc->
fftDim);
8662 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
8665 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = inoutID - %" PRIu64
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
8668 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
8671 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID >= %" PRIu64
") sdataID = inoutID - %" PRIu64
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
8680 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8685 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8709 char shiftX[500] =
"";
8711 sprintf(shiftX,
" + consts.workGroupShiftX ");
8712 char shiftX2[500] =
"";
8715 char shiftY[500] =
"";
8717 sprintf(shiftY,
" + consts.workGroupShiftY ");
8734 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
8766 res =
indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch);
8779 sc->
tempLen = sprintf(sc->
tempStr,
" sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->
inoutID, convTypeRight);
8810 if ((uint64_t)ceil(sc->
size[0] / (
double)mult) % sc->
localSize[0] != 0) {
// Each statement formats one line of generated code into sc->tempStr
// (length kept in sc->tempLen).
// Generated line: inoutID = (fftDim / 2) + 4 * combinedID, with the integer
// division fftDim / 2 evaluated here on the host.
8840 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %" PRIu64
" + 4 * combinedID;\n", sc->
fftDim / 2);
// The remaining statements translate inoutID into sdataID, one fftDim-wide
// range at a time.
// inoutID < fftDim: identity, sdataID = inoutID.
8844 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID < %" PRIu64
") sdataID = inoutID;\n", sc->
fftDim);
// fftDim <= inoutID < 2*fftDim: reversed order, sdataID = (2*fftDim - 1) - inoutID.
8847 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 2 * sc->
fftDim, sc->
fftDim, 2 * sc->
fftDim - 1);
// 2*fftDim <= inoutID < 3*fftDim: shifted down, sdataID = inoutID - 2*fftDim.
8850 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = inoutID - %" PRIu64
";\n", 3 * sc->
fftDim, 2 * sc->
fftDim, 2 * sc->
fftDim);
// 3*fftDim <= inoutID < 4*fftDim: reversed order, sdataID = (4*fftDim - 1) - inoutID.
8853 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")) sdataID = %" PRIu64
" - inoutID;\n", 4 * sc->
fftDim, 3 * sc->
fftDim, 4 * sc->
fftDim - 1);
// inoutID >= 4*fftDim: shifted down, sdataID = inoutID - 4*fftDim (no upper bound is tested).
8856 sc->
tempLen = sprintf(sc->
tempStr,
" if (inoutID >= %" PRIu64
") sdataID = inoutID - %" PRIu64
";\n", 4 * sc->
fftDim, 4 * sc->
fftDim);
8865 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8870 sc->
tempLen = sprintf(sc->
tempStr,
" if ((inoutID < %" PRIu64
")&&(inoutID >= %" PRIu64
")){ \n\
8898 char LFending[4] =
"";
// Pick, per compile-time backend, the strings later pasted into generated
// code: LFending (suffix appended to numeric literals), vecType (name of the
// two-component vector type) and cosDef/sinDef (names of the cosine and sine
// functions). floatType is compared against "float" and "double" only.
// Single precision uses the "f" literal suffix on every backend.
8899 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
// Backend 0 (presumably Vulkan/GLSL, given the vec2/dvec2 type names):
// plain cos/sin, and "LF" as the double-precision literal suffix.
8900#if(VKFFT_BACKEND==0)
8901 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
8902 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
8903 char cosDef[20] =
"cos";
8904 char sinDef[20] =
"sin";
8905 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
// Backend 1 (the branch that includes cuda_runtime.h at the top of the file):
// float2/double2, the __cosf/__sinf intrinsics, "l" as the double suffix.
8906#elif(VKFFT_BACKEND==1)
8907 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
8908 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
8909 char cosDef[20] =
"__cosf";
8910 char sinDef[20] =
"__sinf";
8911 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
// Backend 2 (the branch that includes the hip/ headers): same choices as backend 1.
8912#elif(VKFFT_BACKEND==2)
8913 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
8914 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
8915 char cosDef[20] =
"__cosf";
8916 char sinDef[20] =
"__sinf";
8917 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
// Backend 3 (the branch that includes the OpenCL header): float2/double2 and
// native_cos/native_sin.
8918#elif(VKFFT_BACKEND==3)
8919 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
8920 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
8921 char cosDef[20] =
"native_cos";
8922 char sinDef[20] =
"native_sin";
8927 switch (reorderType) {
8929 char shiftX[500] =
"";
8949 uint64_t
id = (i / logicalRegistersPerThread) * sc->
registers_per_thread + i % logicalRegistersPerThread;
8964 if (!strcmp(floatType,
"float")) {
8965 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(angle);\n", cosDef);
8968 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(angle);\n", sinDef);
8973 if (!strcmp(floatType,
"double")) {
8974 sc->
tempLen = sprintf(sc->
tempStr,
" mult = sincos_20(angle);\n");
8981 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id]);
8985 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id], sc->
regIDs[
id]);
8989 %s.x = w.x;\n", sc->
regIDs[
id]);
9000 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->
inoutID, sc->
inoutID);
9005 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->
inoutID, sc->
inoutID, sc->
inoutID);
9009 sdata[%s].x = w.x;\n", sc->
inoutID);
9023 char shiftX[500] =
"";
9042 uint64_t
id = (i / logicalRegistersPerThread) * sc->
registers_per_thread + i % logicalRegistersPerThread;
9058 if (!strcmp(floatType,
"float")) {
9059 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(angle);\n", cosDef);
9062 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(angle);\n", sinDef);
9067 if (!strcmp(floatType,
"double")) {
9068 sc->
tempLen = sprintf(sc->
tempStr,
" mult = sincos_20(angle);\n");
9075 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id]);
9079 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id], sc->
regIDs[
id]);
9083 %s.x = w.x;\n", sc->
regIDs[
id]);
9094 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->
inoutID, sc->
inoutID);
9099 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->
inoutID, sc->
inoutID, sc->
inoutID);
9103 sdata[%s].x = w.x;\n", sc->
inoutID);
9122 char LFending[4] =
"";
9123 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
9124#if(VKFFT_BACKEND==0)
9125 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
9126 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
9127 char cosDef[20] =
"cos";
9128 char sinDef[20] =
"sin";
9129 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
9130#elif(VKFFT_BACKEND==1)
9131 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9132 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9133 char cosDef[20] =
"__cosf";
9134 char sinDef[20] =
"__sinf";
9135 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9136#elif(VKFFT_BACKEND==2)
9137 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9138 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9139 char cosDef[20] =
"__cosf";
9140 char sinDef[20] =
"__sinf";
9141 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9142#elif(VKFFT_BACKEND==3)
9143 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9144 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9145 char cosDef[20] =
"native_cos";
9146 char sinDef[20] =
"native_sin";
9151 switch (reorderType) {
9153 char shiftX[500] =
"";
9173 uint64_t
id = (i / logicalRegistersPerThread) * sc->
registers_per_thread + i % logicalRegistersPerThread;
9189 if (!strcmp(floatType,
"float")) {
9190 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(angle);\n", cosDef);
9193 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(angle);\n", sinDef);
9198 if (!strcmp(floatType,
"double")) {
9199 sc->
tempLen = sprintf(sc->
tempStr,
" mult = sincos_20(angle);\n");
9205 if (!strcmp(floatType,
"float")) {
9206 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(angle);\n", cosDef);
9209 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = -%s(angle);\n", sinDef);
9214 if (!strcmp(floatType,
"double")) {
9215 sc->
tempLen = sprintf(sc->
tempStr,
" mult = sincos_20(-angle);\n");
9223 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id]);
9227 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id], sc->
regIDs[
id]);
9231 %s.x = w.x;\n", sc->
regIDs[
id]);
9242 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->
inoutID, sc->
inoutID);
9247 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->
inoutID, sc->
inoutID, sc->
inoutID);
9251 sdata[%s].x = w.x;\n", sc->
inoutID);
9264 char shiftX[500] =
"";
9284 uint64_t
id = (i / logicalRegistersPerThread) * sc->
registers_per_thread + i % logicalRegistersPerThread;
9300 if (!strcmp(floatType,
"float")) {
9301 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(angle);\n", cosDef);
9304 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(angle);\n", sinDef);
9309 if (!strcmp(floatType,
"double")) {
9310 sc->
tempLen = sprintf(sc->
tempStr,
" mult = sincos_20(angle);\n");
9316 if (!strcmp(floatType,
"float")) {
9317 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(angle);\n", cosDef);
9320 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = -%s(angle);\n", sinDef);
9325 if (!strcmp(floatType,
"double")) {
9326 sc->
tempLen = sprintf(sc->
tempStr,
" mult = sincos_20(-angle);\n");
9334 w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id]);
9338 %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->
regIDs[
id], sc->
regIDs[
id], sc->
regIDs[
id]);
9342 %s.x = w.x;\n", sc->
regIDs[
id]);
9353 w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->
inoutID, sc->
inoutID);
9358 sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->
inoutID, sc->
inoutID, sc->
inoutID);
9362 sdata[%s].x = w.x;\n", sc->
inoutID);
9382 char LFending[4] =
"";
9383 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
9384#if(VKFFT_BACKEND==0)
9385 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
9386 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
9387 char cosDef[20] =
"cos";
9388 char sinDef[20] =
"sin";
9389 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
9390#elif(VKFFT_BACKEND==1)
9391 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9392 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9393 char cosDef[20] =
"__cosf";
9394 char sinDef[20] =
"__sinf";
9395 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9396#elif(VKFFT_BACKEND==2)
9397 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9398 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9399 char cosDef[20] =
"__cosf";
9400 char sinDef[20] =
"__sinf";
9401 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9402#elif(VKFFT_BACKEND==3)
9403 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9404 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9405 char cosDef[20] =
"native_cos";
9406 char sinDef[20] =
"native_sin";
9409 char shiftX[500] =
"";
9412 char requestCoordinate[100] =
"";
9414 char index_x[2000] =
"";
9415 char index_y[2000] =
"";
9416 char requestBatch[100] =
"";
9417 char separateRegisterStore[100] =
"";
9418 char kernelName[100] =
"";
9419 sprintf(kernelName,
"BluesteinMultiplication");
// Build the index expression for the current register i; the form depends on
// strideType. The %s arguments are strings held in sc (variable names and the
// shiftX suffix); the PRIu64 arguments are constants computed here on the host.
9427 switch (strideType) {
// First group of strideType codes: the expression is written to index_x as
//   (gid_x + shiftX) % stageStartSize
//   + stageStartSize * (lid_y + i * localSize[1])
//   + ((gid_x + shiftX) / stageStartSize) * (stageStartSize * fftDim)
// where gid_x / lid_y stand for sc->gl_GlobalInvocationID_x / sc->gl_LocalInvocationID_y.
9428 case 0:
case 2:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144:
9437 sprintf(index_x,
" (%s%s) %% (%" PRIu64
") + %" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") * (%" PRIu64
")", sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i)*sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
// Second group of strideType codes: a full assignment statement is written to
// sc->tempStr instead, setting the variable named by sc->inoutID to
//   stageStartSize * (lid_y + i * localSize[1])
//   + ((gid_x + shiftX) / fft_dim_x) % stageStartSize
//   + ((gid_x + shiftX) / (fft_dim_x * stageStartSize)) * (fftDim * stageStartSize)
9445 case 1:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145:
9453 sc->
tempLen = sprintf(sc->
tempStr,
" %s = (%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
"));\n", sc->
inoutID, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i)*sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
9475 if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) {
9496 if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) {
9528 char LFending[4] =
"";
9529 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
9530#if(VKFFT_BACKEND==0)
9531 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
9532 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
9533 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
9534#elif(VKFFT_BACKEND==1)
9535 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9536 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9537 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9538#elif(VKFFT_BACKEND==2)
9539 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9540 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9541 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9542#elif(VKFFT_BACKEND==3)
9543 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9544 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9548 char convolutionInverse[10] =
"";
9551 sprintf(convolutionInverse,
", 0");
9553 sprintf(convolutionInverse,
", 1");
9557 uint64_t logicalGroupSize = sc->
fftDim / logicalStoragePerThread;
9575 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
9581 sc->
tempLen = sprintf(sc->
tempStr,
" LUTId = stageInvocationID + %" PRIu64
";\n", stageSizeSum);
9583 sc->
tempLen = sprintf(sc->
tempStr,
" angle = stageInvocationID * %.17f%s;\n", stageAngle, LFending);
9588 for (uint64_t i = 0; i < stageRadix; i++) {
9589 uint64_t
id = j + i * logicalRegistersPerThread / stageRadix;
9590 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9616 char** regID = (
char**)malloc(
sizeof(
char*) * stageRadix);
9618 for (uint64_t i = 0; i < stageRadix; i++) {
9619 regID[i] = (
char*)malloc(
sizeof(
char) * 50);
9621 for (uint64_t j = 0; j < i; j++) {
9629 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9630 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9631 sprintf(regID[i],
"%s", sc->
regIDs[
id]);
9640 for (uint64_t i = 0; i < stageRadix; i++) {
9641 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9642 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9643 sprintf(sc->
regIDs[
id],
"%s", regID[i]);
9645 for (uint64_t i = 0; i < stageRadix; i++) {
9656 for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
9657 uint64_t
id = i + k * logicalRegistersPerThread;
9658 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9660 shuffle[%" PRIu64
"]=%s;\n", i, sc->
regIDs[
id]);
9664 for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
9665 uint64_t
id = i + k * logicalRegistersPerThread;
9666 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9668 %s=shuffle[(%" PRIu64
"+tshuffle)%%(%" PRIu64
")];\n", sc->
regIDs[
id], i, logicalRegistersPerThread);
9688 char LFending[4] =
"";
9689 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
9690#if(VKFFT_BACKEND==0)
9691 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
9692 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
9693 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
9694#elif(VKFFT_BACKEND==1)
9695 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9696 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9697 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9698#elif(VKFFT_BACKEND==2)
9699 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9700 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9701 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9702#elif(VKFFT_BACKEND==3)
9703 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9704 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9708 char convolutionInverse[10] =
"";
9711 sprintf(convolutionInverse,
", 0");
9713 sprintf(convolutionInverse,
", 1");
9717 uint64_t logicalGroupSize = sc->
fftDim / logicalStoragePerThread;
9734 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
9740 sc->
tempLen = sprintf(sc->
tempStr,
" LUTId = stageInvocationID + %" PRIu64
";\n", stageSizeSum);
9742 sc->
tempLen = sprintf(sc->
tempStr,
" angle = stageInvocationID * %.17f%s;\n", stageAngle, LFending);
9746 for (uint64_t i = 0; i < stageRadix; i++) {
9747 uint64_t
id = j + i * logicalRegistersPerThread / stageRadix;
9748 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9756 char** regID = (
char**)malloc(
sizeof(
char*) * stageRadix);
9758 for (uint64_t i = 0; i < stageRadix; i++) {
9759 regID[i] = (
char*)malloc(
sizeof(
char) * 50);
9761 for (uint64_t j = 0; j < i; j++) {
9769 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9770 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9771 sprintf(regID[i],
"%s", sc->
regIDs[
id]);
9780 for (uint64_t i = 0; i < stageRadix; i++) {
9781 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9782 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9783 sprintf(sc->
regIDs[
id],
"%s", regID[i]);
9785 for (uint64_t i = 0; i < stageRadix; i++) {
9805 if (stageSize == 1) {
9814 switch (shuffleType) {
9815 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144: {
9821 case 1:
case 2:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145: {
9859 char LFending[4] =
"";
9860 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
9861#if(VKFFT_BACKEND==0)
9862 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
9863 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
9864 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
9865#elif(VKFFT_BACKEND==1)
9866 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9867 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9868 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9869#elif(VKFFT_BACKEND==2)
9870 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9871 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
9872 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
9873#elif(VKFFT_BACKEND==3)
9874 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
9875 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
// stageNormalization is the text of a scaling literal for generated code.
// It starts empty and is filled only when normalizationValue differs from 1,
// so an empty string means "no scaling"; later code in this function tests
// for that with strcmp(stageNormalization, "").
9877 char stageNormalization[50] =
"";
9878 uint64_t normalizationValue = 1;
// Literal text is 1/normalizationValue printed with 17 fractional digits,
// followed by the precision suffix held in LFending.
9892 if (normalizationValue != 1) {
9893 sprintf(stageNormalization,
"%.17f%s", 1.0 / (
double)(normalizationValue), LFending);
9895 char tempNum[50] =
"";
9902 uint64_t logicalGroupSize = sc->
fftDim / logicalStoragePerThread;
9903 uint64_t logicalGroupSizeNext = sc->
fftDim / logicalStoragePerThreadNext;
9916 tempID[i] = (
char*)malloc(
sizeof(
char) * 50);
9918 for (uint64_t j = 0; j < i; j++) {
9953 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
9954 sprintf(tempNum,
"%" PRIu64
"", j * logicalGroupSize);
9959 sprintf(tempNum,
"%" PRIu64
"", stageSize);
9964 sprintf(tempNum,
"%" PRIu64
"", stageRadix);
9974 for (uint64_t i = 0; i < stageRadix; i++) {
9975 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
9976 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
9979 sprintf(tempNum,
"%" PRIu64
"", i);
9982 sprintf(tempNum,
"%" PRIu64
"", logicalRegistersPerThread);
9985 sprintf(tempNum,
"%" PRIu64
"", stageSize);
9998 if (strcmp(stageNormalization,
"")) {
10009 for (uint64_t i = 0; i < stageRadix; i++) {
10010 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10011 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
10014 sprintf(tempNum,
"%" PRIu64
"", i * stageSize);
10047 if (strcmp(stageNormalization,
"")) {
10086 for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) {
10087 for (uint64_t i = 0; i < stageRadixNext; i++) {
10088 uint64_t
id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalStoragePerThreadNext / stageRadixNext;
10089 id = (
id / logicalRegistersPerThreadNext) * sc->
registers_per_thread +
id % logicalRegistersPerThreadNext;
10091 sprintf(tempNum,
"%" PRIu64
"", t * logicalGroupSizeNext);
10135 sprintf(sc->
regIDs[i],
"%s", tempID[i]);
10155 tempID[i] = (
char*)malloc(
sizeof(
char) * 50);
10157 for (uint64_t j = 0; j < i; j++) {
10167 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
10168 for (uint64_t i = 0; i < stageRadix; i++) {
10169 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10170 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
10179 sprintf(sc->
regIDs[i],
"%s", tempID[i]);
10204 for (uint64_t i = 0; i < logicalStoragePerThread; i++) {
10205 if (strcmp(stageNormalization,
"")) {
10229 char LFending[4] =
"";
10230 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
10231#if(VKFFT_BACKEND==0)
10232 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
10233 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
10234 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
10235#elif(VKFFT_BACKEND==1)
10236 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
10237 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
10238 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
10239#elif(VKFFT_BACKEND==2)
10240 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
10241 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
10242 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
10243#elif(VKFFT_BACKEND==3)
10244 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
10245 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
10248 char tempNum[50] =
"";
10255 uint64_t logicalGroupSize = sc->
fftDim / logicalStoragePerThread;
10256 uint64_t logicalGroupSizeNext = sc->
fftDim / logicalStoragePerThreadNext;
10257 char stageNormalization[50] =
"";
10258 uint64_t normalizationValue = 1;
10272 if (normalizationValue != 1) {
10273 sprintf(stageNormalization,
"%.17f%s", 1.0 / (
double)(normalizationValue), LFending);
10280 if (stageSize == sc->
fftDim / stageRadix) {
10293 tempID[i] = (
char*)malloc(
sizeof(
char) * 50);
10295 for (uint64_t j = 0; j < i; j++) {
10330 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
10331 sprintf(tempNum,
"%" PRIu64
"", j * logicalGroupSize);
10336 sprintf(tempNum,
"%" PRIu64
"", stageSize);
10341 sprintf(tempNum,
"%" PRIu64
"", stageRadix);
10350 for (uint64_t i = 0; i < stageRadix; i++) {
10351 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10352 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
10355 sprintf(tempNum,
"%" PRIu64
"", i * stageSize);
10363 if (strcmp(stageNormalization,
"")) {
10401 for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) {
10402 for (uint64_t i = 0; i < stageRadixNext; i++) {
10403 uint64_t
id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalRegistersPerThreadNext / stageRadixNext;
10404 id = (
id / logicalRegistersPerThreadNext) * sc->
registers_per_thread +
id % logicalRegistersPerThreadNext;
10405 sprintf(tempNum,
"%" PRIu64
"", t * logicalGroupSizeNext);
10445 sprintf(sc->
regIDs[i],
"%s", tempID[i]);
10463 tempID[i] = (
char*)malloc(
sizeof(
char) * 50);
10465 for (uint64_t j = 0; j < i; j++) {
10475 for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) {
10476 for (uint64_t i = 0; i < stageRadix; i++) {
10477 uint64_t
id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix;
10478 id = (
id / logicalRegistersPerThread) * sc->
registers_per_thread +
id % logicalRegistersPerThread;
10487 sprintf(sc->
regIDs[i],
"%s", tempID[i]);
10512 for (uint64_t i = 0; i < logicalRegistersPerThread; i++) {
10513 if (strcmp(stageNormalization,
"")) {
10534 switch (shuffleType) {
10535 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144: {
10541 case 1:
case 2:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145: {
10542 res =
appendRadixShuffleStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext);
10553 switch (shuffleType) {
10554 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144: {
10555 uint64_t logicalStoragePerThread;
10562 uint64_t logicalGroupSize = sc->
fftDim / logicalStoragePerThread;
10578 for (uint64_t i = 0; i < logicalStoragePerThread / sc->
registerBoost; i++) {
10612 for (uint64_t i = 0; i < logicalStoragePerThread / sc->
registerBoost; i++) {
10639 case 1:
case 2:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145: {
10640 uint64_t logicalStoragePerThread;
10647 uint64_t logicalGroupSize = sc->
fftDim / logicalStoragePerThread;
10663 for (uint64_t i = 0; i < logicalStoragePerThread / sc->
registerBoost; i++) {
10697 for (uint64_t i = 0; i < logicalStoragePerThread / sc->
registerBoost; i++) {
10731 switch (readType) {
10732 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144:
10755 switch (coordinate) {\n\
10775 case %" PRIu64
":\n", i);
10803 case 1:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145:
10826 switch (coordinate) {\n\
10846 case %" PRIu64
":\n", i);
10881 switch (readType) {
10882 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144:
10905 switch (coordinate) {\n\
10925 case %" PRIu64
":\n", i);
10953 case 1:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145:
10976 switch (coordinate) {\n\
10996 case %" PRIu64
":\n", i);
11031#if(VKFFT_BACKEND==0)
11032 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
11033 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
11034#elif(VKFFT_BACKEND==1)
11035 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11036 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11037#elif(VKFFT_BACKEND==2)
11038 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11039 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11040#elif(VKFFT_BACKEND==3)
11041 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11042 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11044 char separateRegisterStore[100] =
"_store";
11047 sc->
tempLen = sprintf(sc->
tempStr,
" %s %s%s;\n", vecType, sc->
regIDs[i], separateRegisterStore);
11051 sc->
tempLen = sprintf(sc->
tempStr,
" %s %s_%" PRIu64
"%s;\n", vecType, sc->
regIDs[i], j, separateRegisterStore);
11062 sc->
tempLen = sprintf(sc->
tempStr,
" %s_%" PRIu64
"%s=%s_%" PRIu64
";\n", sc->
regIDs[i], j, separateRegisterStore, sc->
regIDs[i], j);
11067 sc->
tempLen = sprintf(sc->
tempStr,
" for (%s batchID=0; batchID < %" PRIu64
"; batchID++){\n", uintType, sc->
numKernels);
11074 char shiftX[500] =
"";
11077 char requestCoordinate[100] =
"";
11080 sprintf(requestCoordinate,
"0");
11083 char index_x[2000] =
"";
11084 char index_y[2000] =
"";
11085 char requestBatch[100] =
"";
11086 char separateRegisterStore[100] =
"";
11089 sprintf(requestBatch,
"batchID");
11090 sprintf(separateRegisterStore,
"_store");
11098 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_real%" PRIu64
" = 0;\n", floatType, j);
11101 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_imag%" PRIu64
" = 0;\n", floatType, j);
11106 switch (dataType) {
11107 case 0:
case 5:
case 6:
case 110:
case 120:
case 130:
case 140:
case 142:
case 144:
11115 sc->
tempLen = sprintf(sc->
tempStr,
" %s = %s+%" PRIu64
"+%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");", sc->
inoutID, sc->
gl_LocalInvocationID_x, i * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
11122 case 1:
case 111:
case 121:
case 131:
case 141:
case 143:
case 145:
11130 sc->
tempLen = sprintf(sc->
tempStr,
" %s = (%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
"));\n", sc->
inoutID, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i)*sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize);
11137 char kernelName[100] =
"";
11138 sprintf(kernelName,
"BluesteinConvolutionKernel");
11140 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real0 = %s[inoutID].x * %s%s.x + %s[inoutID].y * %s%s.y;\n", kernelName, sc->
regIDs[i], separateRegisterStore, kernelName, sc->
regIDs[i], separateRegisterStore);
11142 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real0 = %s[inoutID].x * %s%s.x - %s[inoutID].y * %s%s.y;\n", kernelName, sc->
regIDs[i], separateRegisterStore, kernelName, sc->
regIDs[i], separateRegisterStore);
11148 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag0 = %s[inoutID].x * %s%s.y - %s[inoutID].y * %s%s.x;\n", kernelName, sc->
regIDs[i], separateRegisterStore, kernelName, sc->
regIDs[i], separateRegisterStore);
11150 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag0 = %s[inoutID].x * %s%s.y + %s[inoutID].y * %s%s.x;\n", kernelName, sc->
regIDs[i], separateRegisterStore, kernelName, sc->
regIDs[i], separateRegisterStore);
11169 char shiftX[500] =
"";
11172 char requestCoordinate[100] =
"";
11175 sprintf(requestCoordinate,
"0");
11178 char index_x[2000] =
"";
11179 char index_y[2000] =
"";
11180 char requestBatch[100] =
"";
11181 char separateRegisterStore[100] =
"";
11184 sprintf(requestBatch,
"batchID");
11185 sprintf(separateRegisterStore,
"_store");
11193 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_real%" PRIu64
" = 0;\n", floatType, j);
11196 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_imag%" PRIu64
" = 0;\n", floatType, j);
11203 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real%" PRIu64
" = 0;\n", j);
11206 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" = 0;\n", j);
11211 switch (dataType) {
11226 sprintf(index_x,
"(combinedID %% %" PRIu64
") * %" PRIu64
" + (combinedID / %" PRIu64
") * %" PRIu64
"", sc->
fftDim, sc->
inputStride[0], sc->
fftDim, sc->
inputStride[1]);
11231 res =
indexInputVkFFT(sc, uintType, dataType, index_x, 0, requestCoordinate, requestBatch);
11244 sprintf(index_x,
"(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * %" PRIu64
"", sc->
fftDim, sc->
fftDim, sc->
inputStride[1]);
11249 res =
indexInputVkFFT(sc, uintType, dataType, index_x, 0, requestCoordinate, requestBatch);
11263 sprintf(index_x,
"%s+%" PRIu64
"+%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
")", sc->
gl_LocalInvocationID_x, i * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
11268 res =
indexInputVkFFT(sc, uintType, dataType, index_x, 0, requestCoordinate, requestBatch);
11285 sprintf(index_y,
"(%s+%" PRIu64
")+((%s%s)/%" PRIu64
")%%(%" PRIu64
")+((%s%s)/%" PRIu64
")*(%" PRIu64
")", sc->
gl_LocalInvocationID_y, i * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim);
11290 res =
indexInputVkFFT(sc, uintType, dataType, index_x, index_y, requestCoordinate, requestBatch);
11301 char kernelName[100] =
"";
11302 sprintf(kernelName,
"kernel_obj");
// Each statement below formats one accumulation line of generated code into
// sc->tempStr. Notation used in these comments:
//   K = <kernelName>[inoutID + k * inputStride[3]]   (complex kernel element)
//   R = <regIDs[i]><separateRegisterStore>            (complex register value)
//   R_l = <regIDs[i]>_<l><separateRegisterStore>      (register name with numeric suffix l)
// temp_real<j> / temp_imag<j> are the accumulators for output index j.

// temp_real<j> += K.x * R.x - K.y * R.y   -- real part of K * R.
11315 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s%s.x - %s[inoutID+%" PRIu64
"].y * %s%s.y;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore);
// Same real part of K * R, reading the suffixed register R_l.
11317 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s_%" PRIu64
"%s.x - %s[inoutID+%" PRIu64
"].y * %s_%" PRIu64
"%s.y;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore);
// temp_real<j> += K.x * R.x + K.y * R.y   -- real part of the product with
// one operand conjugated (K * conj(R) and conj(K) * R share this real part).
11321 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s%s.x + %s[inoutID+%" PRIu64
"].y * %s%s.y;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore);
// Same conjugated-product real part, reading the suffixed register R_l.
11323 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s_%" PRIu64
"%s.x + %s[inoutID+%" PRIu64
"].y * %s_%" PRIu64
"%s.y;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore);
// temp_imag<j> += K.x * R.y + K.y * R.x   -- imaginary part of K * R.
11338 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s%s.y + %s[inoutID+%" PRIu64
"].y * %s%s.x;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore);
// Same imaginary part of K * R, reading the suffixed register R_l.
11340 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s_%" PRIu64
"%s.y + %s[inoutID+%" PRIu64
"].y * %s_%" PRIu64
"%s.x;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore);
// temp_imag<j> += K.y * R.x - K.x * R.y   -- imaginary part of K * conj(R).
11345 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += %s[inoutID+%" PRIu64
"].y * %s%s.x - %s[inoutID+%" PRIu64
"].x * %s%s.y ;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore);
// Same imaginary part of K * conj(R), reading the suffixed register R_l.
11347 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += %s[inoutID+%" PRIu64
"].y * %s_%" PRIu64
"%s.x - %s[inoutID+%" PRIu64
"].x * %s_%" PRIu64
"%s.y;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore);
// temp_imag<j> += K.x * R.y - K.y * R.x   -- imaginary part of conj(K) * R
// (the negation of the K * conj(R) form above).
11351 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s%s.y - %s[inoutID+%" PRIu64
"].y * %s%s.x;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], separateRegisterStore);
11353 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += %s[inoutID+%" PRIu64
"].x * %s_%" PRIu64
"%s.y - %s[inoutID+%" PRIu64
"].y * %s_%" PRIu64
"%s.x;\n", j, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore, kernelName, k * sc->
inputStride[3], sc->
regIDs[i], l, separateRegisterStore);
11362#if(VKFFT_BACKEND==0)
11363 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = inversesqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11364#elif(VKFFT_BACKEND==1)
11365 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11366#elif(VKFFT_BACKEND==2)
11367 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11368#elif(VKFFT_BACKEND==3)
11369 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n");
11390#if(VKFFT_BACKEND==0)
11391 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = inversesqrt(temp_real%" PRIu64
"*temp_real%" PRIu64
"+temp_imag%" PRIu64
"*temp_imag%" PRIu64
");\n", l, l, l, l);
11392#elif(VKFFT_BACKEND==1)
11393 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = rsqrt(temp_real%" PRIu64
"*temp_real%" PRIu64
"+temp_imag%" PRIu64
"*temp_imag%" PRIu64
");\n", l, l, l, l);
11394#elif(VKFFT_BACKEND==2)
11395 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = rsqrt(temp_real%" PRIu64
"*temp_real%" PRIu64
"+temp_imag%" PRIu64
"*temp_imag%" PRIu64
");\n", l, l, l, l);
11396#elif(VKFFT_BACKEND==3)
11397 sc->
tempLen = sprintf(sc->
tempStr,
" w.x = rsqrt(temp_real%" PRIu64
"*temp_real%" PRIu64
"+temp_imag%" PRIu64
"*temp_imag%" PRIu64
");\n", l, l, l, l);
11401 sc->
tempLen = sprintf(sc->
tempStr,
" %s_%" PRIu64
".x = temp_real%" PRIu64
" * w.x;\n", sc->
regIDs[i], l, l);
11404 sc->
tempLen = sprintf(sc->
tempStr,
" %s_%" PRIu64
".y = temp_imag%" PRIu64
" * w.x;\n", sc->
regIDs[i], l, l);
11409 sc->
tempLen = sprintf(sc->
tempStr,
" %s_%" PRIu64
".x = temp_real%" PRIu64
";\n", sc->
regIDs[i], l, l);
11412 sc->
tempLen = sprintf(sc->
tempStr,
" %s_%" PRIu64
".y = temp_imag%" PRIu64
";\n", sc->
regIDs[i], l, l);
11421 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_real%" PRIu64
" = 0;\n", floatType, j);
11433 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real%" PRIu64
" += kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].x * %s%s.x - kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].y * %s%s.y;\n", j, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], separateRegisterStore, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], separateRegisterStore);
11435 sc->
tempLen = sprintf(sc->
tempStr,
" temp_real%" PRIu64
" += kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].x * %s_%" PRIu64
"%s.x - kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].y * %s_%" PRIu64
"%s.y;\n", j, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], l, separateRegisterStore, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], l, separateRegisterStore);
11441 sc->
tempLen = sprintf(sc->
tempStr,
" %s temp_imag%" PRIu64
" = 0;\n", floatType, j);
11453 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].x * %s%s.y + kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].y * %s%s.x;\n", j, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], separateRegisterStore, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], separateRegisterStore);
11455 sc->
tempLen = sprintf(sc->
tempStr,
" temp_imag%" PRIu64
" += kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].x * %s_%" PRIu64
"%s.y + kernelBlocks[(inoutID+%" PRIu64
")/%" PRIu64
"].%s[(inoutID+%" PRIu64
") %% %" PRIu64
"].y * %s_%" PRIu64
"%s.x;\n", j, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], l, separateRegisterStore, k * sc->
inputStride[3], sc->
kernelBlockSize, kernelName, k * sc->
inputStride[3], sc->
kernelBlockSize, sc->
regIDs[i], l, separateRegisterStore);
11467 sc->
tempLen = sprintf(sc->
tempStr,
" %s_%" PRIu64
".x = temp_real%" PRIu64
";\n", sc->
regIDs[i], l, l);
11470 sc->
tempLen = sprintf(sc->
tempStr,
" %s_%" PRIu64
".y = temp_imag%" PRIu64
";\n", sc->
regIDs[i], l, l);
11484 switch (writeType) {
11528 case 110:
case 111:
case 120:
case 121:
case 130:
case 131:
case 140:
case 141:
case 142:
case 143:
case 144:
case 145:
11538 double double_PI = 3.1415926535897932384626433832795;
11540 char outputsStruct[20] =
"";
11541 char LFending[4] =
"";
11542 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
11543#if(VKFFT_BACKEND==0)
11544 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
11545 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
11547 sprintf(outputsStruct,
"outputs");
11549 sprintf(outputsStruct,
".outputs");
11550 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
11551 char cosDef[20] =
"cos";
11552 char sinDef[20] =
"sin";
11553#elif(VKFFT_BACKEND==1)
11554 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11555 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11556 sprintf(outputsStruct,
"outputs");
11557 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
11558 char cosDef[20] =
"__cosf";
11559 char sinDef[20] =
"__sinf";
11560#elif(VKFFT_BACKEND==2)
11561 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11562 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11563 sprintf(outputsStruct,
"outputs");
11564 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
11565 char cosDef[20] =
"__cosf";
11566 char sinDef[20] =
"__sinf";
11567#elif(VKFFT_BACKEND==3)
11568 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
11569 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
11570 sprintf(outputsStruct,
"outputs");
11572 char cosDef[20] =
"native_cos";
11573 char sinDef[20] =
"native_sin";
11575 char convTypeLeft[20] =
"";
11576 char convTypeRight[20] =
"";
11577 if ((!strcmp(floatTypeMemory,
"half")) && (strcmp(floatType,
"half"))) {
11578 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11579 sprintf(convTypeLeft,
"float16_t(");
11580 sprintf(convTypeRight,
")");
11583 sprintf(convTypeLeft,
"f16vec2(");
11584 sprintf(convTypeRight,
")");
11587 if ((!strcmp(floatTypeMemory,
"float")) && (strcmp(floatType,
"float"))) {
11588 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11589#if(VKFFT_BACKEND==0)
11590 sprintf(convTypeLeft,
"float(");
11591 sprintf(convTypeRight,
")");
11592#elif(VKFFT_BACKEND==1)
11593 sprintf(convTypeLeft,
"(float)");
11595#elif(VKFFT_BACKEND==2)
11596 sprintf(convTypeLeft,
"(float)");
11598#elif(VKFFT_BACKEND==3)
11599 sprintf(convTypeLeft,
"(float)");
11604#if(VKFFT_BACKEND==0)
11605 sprintf(convTypeLeft,
"vec2(");
11606 sprintf(convTypeRight,
")");
11607#elif(VKFFT_BACKEND==1)
11608 sprintf(convTypeLeft,
"conv_float2(");
11609 sprintf(convTypeRight,
")");
11610#elif(VKFFT_BACKEND==2)
11611 sprintf(convTypeLeft,
"conv_float2(");
11612 sprintf(convTypeRight,
")");
11613#elif(VKFFT_BACKEND==3)
11614 sprintf(convTypeLeft,
"conv_float2(");
11615 sprintf(convTypeRight,
")");
11619 if ((!strcmp(floatTypeMemory,
"double")) && (strcmp(floatType,
"double"))) {
11620 if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) {
11621#if(VKFFT_BACKEND==0)
11622 sprintf(convTypeLeft,
"double(");
11623 sprintf(convTypeRight,
")");
11624#elif(VKFFT_BACKEND==1)
11625 sprintf(convTypeLeft,
"(double)");
11627#elif(VKFFT_BACKEND==2)
11628 sprintf(convTypeLeft,
"(double)");
11630#elif(VKFFT_BACKEND==3)
11631 sprintf(convTypeLeft,
"(double)");
11636#if(VKFFT_BACKEND==0)
11637 sprintf(convTypeLeft,
"dvec2(");
11638 sprintf(convTypeRight,
")");
11639#elif(VKFFT_BACKEND==1)
11640 sprintf(convTypeLeft,
"conv_double2(");
11641 sprintf(convTypeRight,
")");
11642#elif(VKFFT_BACKEND==2)
11643 sprintf(convTypeLeft,
"conv_double2(");
11644 sprintf(convTypeRight,
")");
11645#elif(VKFFT_BACKEND==3)
11646 sprintf(convTypeLeft,
"conv_double2(");
11647 sprintf(convTypeRight,
")");
11652 char index_x[2000] =
"";
11653 char index_y[2000] =
"";
11654 char requestCoordinate[100] =
"";
11657 sprintf(requestCoordinate,
"coordinate");
11660 char requestBatch[100] =
"";
11663 sprintf(requestBatch,
"batchID");
11666 switch (writeType) {
11675 char shiftX[500] =
"";
11677 sprintf(shiftX,
" + consts.workGroupShiftX ");
11678 char shiftY[500] =
"";
11688 char shiftY2[100] =
"";
11690 sprintf(shiftY,
" + consts.workGroupShiftY ");
11699 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
fft_dim_full / sc->
firstStageStartSize);
11709 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
fft_dim_full / sc->
firstStageStartSize);
11783 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11785 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11791 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11793 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11837 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = combinedID %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" + ((combinedID/%" PRIu64
") * %" PRIu64
")+ ((%s%s) %% %" PRIu64
") * %" PRIu64
";\n", sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
localSize[0], sc->
fft_dim_full / sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize);
11845 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = combinedID %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" + ((combinedID/%" PRIu64
") * %" PRIu64
")+ ((%s%s) %% %" PRIu64
") * %" PRIu64
";\n", sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize);
11877 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
gl_WorkGroupSize_x, sc->
gl_WorkGroupSize_x, convTypeRight);
11885 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
gl_WorkGroupSize_y, sc->
gl_WorkGroupSize_y, convTypeRight);
11975 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11977 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11983 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
11985 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12033 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (combinedID %% %" PRIu64
")+(combinedID / %" PRIu64
") * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");", sc->
fftDim, sc->
fftDim, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0] * sc->
firstStageStartSize);
12038 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = %s+%" PRIu64
"+%s * %" PRIu64
" + (((%s%s) %% %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") * %" PRIu64
");", sc->
gl_LocalInvocationID_x, (i + k * sc->
min_registers_per_thread) * sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
firstStageStartSize, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
fftDim, sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1] * sc->
firstStageStartSize);
12108 char shiftX[500] =
"";
12111 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
12117 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * (%" PRIu64
") + (((%s%s) / %" PRIu64
") %% (%" PRIu64
")) * (%" PRIu64
") + ((%s%s) / %" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * (sc->
firstStageStartSize / sc->
fftDim));
12149 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[%s*(%s+%" PRIu64
") + %s]%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
12169 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12178 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * %" PRIu64
" + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12190 sprintf(index_y,
"%" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
")", sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12191 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
12211 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[inoutID / %" PRIu64
"]%s[inoutID %% %" PRIu64
"] = %ssdata[%s*(%s+%" PRIu64
") + %s]%s;\n", sc->
outputBufferBlockSize, outputsStruct, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
12244 char shiftX[500] =
"";
12252 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s%s) %% (%" PRIu64
") + %" PRIu64
" * (%s + %" PRIu64
") + ((%s%s) / %" PRIu64
") * (%" PRIu64
");\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_GlobalInvocationID_x, shiftX, sc->
stageStartSize, sc->
stageStartSize * sc->
fftDim);
12287 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[inoutID / %" PRIu64
"]%s[inoutID %% %" PRIu64
"] = %ssdata[%s*(%s+%" PRIu64
") + %s]%s;\n", sc->
outputBufferBlockSize, outputsStruct, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
12319 char shiftX[500] =
"";
12321 sprintf(shiftX,
" + consts.workGroupShiftX ");
12322 char shiftY[500] =
"";
12326 sprintf(shiftY,
" + consts.workGroupShiftY ");
12342 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
12357 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
12371 for (uint64_t i = 0; i < num_out; i++) {
12391 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12403 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12440 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim / 2 + 1);
12443 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12446 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12452 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12455 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12462 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
12469 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim / 2 + 1);
12472 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12475 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12481 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12484 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, 2 * (sc->
fftDim / 2 + 1));
12491 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
12501 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]%s;\n", outputsStruct, convTypeLeft, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, convTypeRight);
12509 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")]%s;\n", outputsStruct, convTypeLeft, sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1, convTypeRight);
12540 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12547 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12580 char shiftY[500] =
"";
12582 sprintf(shiftY,
" + consts.workGroupShiftY * %" PRIu64
"", sc->
localSize[1]);
12613 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12620 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12649 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s.x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[i], convTypeRight);
12660 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s%s.y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[i], convTypeRight);
12670 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12672 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12680 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12682 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") * sharedStride+ (combinedID / %" PRIu64
")].y%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12689 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12691 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12699 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12701 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim, sc->
fftDim, convTypeRight);
12720 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
12727 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
12751 char shiftX[500] =
"";
12753 sprintf(shiftX,
" + consts.workGroupShiftX ");
12754 char shiftY[500] =
"";
12757 char shiftY2[500] =
"";
12759 sprintf(shiftY2,
" + consts.workGroupShiftY ");
12776 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
12791 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
12805 for (uint64_t i = 0; i < num_out; i++) {
12864 sc->
tempLen = sprintf(sc->
tempStr,
" %s = (sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")]);\n", sc->
regIDs[0], sc->
fftDim, sc->
fftDim);
12868 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
// Multi-block output path: emit the store of the register's .y component to
// the element located outputStride[1] past inoutID. Both the block selector
// and the in-block offset must be computed from (inoutID + outputStride[1]);
// the block selector previously lacked the '+', which produced
// "(inoutID <stride>)/ <blockSize>" - invalid code in the generated kernel.
12877 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], convTypeRight);
12887 sc->
tempLen = sprintf(sc->
tempStr,
" %s = (sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride]);\n", sc->
regIDs[0], sc->
fftDim, sc->
fftDim);
12891 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], convTypeRight);
// Multi-block output path: emit the store of the register's .y component to
// the element located outputStride[1] past inoutID. The block selector and
// the in-block offset are both derived from (inoutID + outputStride[1]); the
// '+' was missing in the block selector, so the generated kernel contained
// "(inoutID <stride>)/ <blockSize>", which does not compile.
12899 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], convTypeRight);
12911 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim);
12915 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
12927 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim);
12931 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
12994 char shiftX[500] =
"";
12997 char shiftY[500] =
"";
13000 char shiftY2[500] =
"";
13002 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13018 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
13029 uint64_t num_out = (uint64_t)ceil(mult * (sc->
fftDim) / (double)sc->
localSize[1]);
13031 for (uint64_t i = 0; i < num_out; i++) {
13065 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13068 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13072 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13074 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13079 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13082 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13089 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of (reg.x*mult.x - reg.y*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13100 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13102 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13108 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of -(reg.y*mult.x + reg.x*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13116 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
localSize[0], sc->
localSize[0]);
13120 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13162 char shiftX[500] =
"";
13164 sprintf(shiftX,
" + consts.workGroupShiftX ");
13165 char shiftY[500] =
"";
13168 char shiftY2[500] =
"";
13170 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13186 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
13201 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
13215 for (uint64_t i = 0; i < num_out; i++) {
13283 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = 2*%s(%.17f%s * (combinedID %% %" PRIu64
") );\n", cosDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
13286 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = 2*%s(%.17f%s * (combinedID %% %" PRIu64
") );\n", sinDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
fftDim / 2 + 1);
13293 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13296 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13300 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13302 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13307 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13310 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
")* sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13317 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of (reg.x*mult.x - reg.y*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13325 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13344 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13346 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13352 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of -(reg.y*mult.x + reg.x*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13365 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13368 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13372 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13374 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13377 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13380 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"-combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[1], LFending, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1), sc->
fftDim, sc->
fftDim / 2 + 1, (sc->
fftDim / 2 + 1));
13387 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of (reg.x*mult.x - reg.y*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13395 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13414 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13416 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13422 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of -(reg.y*mult.x + reg.x*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13437 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
13441 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13451 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13470 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13485 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim / 2 + 1, sc->
fftDim / 2 + 1);
13489 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13499 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID %% %" PRIu64
" > 0){\n", sc->
fftDim / 2 + 1);
13518 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13583 char shiftX[500] =
"";
13586 char shiftY[500] =
"";
13589 char shiftY2[500] =
"";
13591 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13606 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
13617 uint64_t num_out = (uint64_t)ceil(mult * (sc->
fftDim / 2 + 1) / (double)sc->
localSize[1]);
13619 for (uint64_t i = 0; i < num_out; i++) {
13663 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = 2*%s(%.17f%s * (combinedID / %" PRIu64
") );\n", cosDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
13666 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = 2*%s(%.17f%s * (combinedID / %" PRIu64
") );\n", sinDef, -double_PI / 2 / sc->
fftDim, LFending, sc->
localSize[0]);
13672 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13675 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13679 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13681 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13686 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13689 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
13696 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of (reg.x*mult.x - reg.y*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13707 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13709 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
13715 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of -(reg.y*mult.x + reg.x*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
13723 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (combinedID %% %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
localSize[0], sc->
localSize[0]);
13727 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13756 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13800 char shiftX[500] =
"";
13802 sprintf(shiftX,
" + consts.workGroupShiftX ");
13803 char shiftY[500] =
"";
13806 char shiftY2[500] =
"";
13808 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13818 uint64_t maxBluesteinCutOff = 1;
13835 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
13881 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13886 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
")* sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13892 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(sdata[sdataID].x)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, convTypeRight);
13901 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(sdata[sdataID].y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, convTypeRight);
13909 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
") * sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13913 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13920 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
13924 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
13976 char shiftX[500] =
"";
13979 char shiftY[500] =
"";
13982 char shiftY2[500] =
"";
13984 sprintf(shiftY2,
" + consts.workGroupShiftY ");
13999 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
14048 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14051 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y-sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14055 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14057 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14062 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].y);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14065 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x+sdata[(%" PRIu64
"-combinedID / %" PRIu64
")* sharedStride + (combinedID %% %" PRIu64
")].x);\n", sc->
regIDs[1], LFending, sc->
localSize[0], sc->
localSize[0], sc->
fftDim, sc->
localSize[0], sc->
localSize[0]);
14072 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of (reg.x*mult.x - reg.y*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
14083 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14085 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[0], sc->
regIDs[0], convTypeRight);
14091 sc->
tempLen = sprintf(sc->
tempStr,
// Formats a store of -(reg.y*mult.x + reg.x*mult.y) into outputBlocks at offset
// (inoutID + outputStride[1]): block selector = offset / outputBufferBlockSize,
// in-block index = offset % outputBufferBlockSize. The block selector previously
// lacked the '+', producing "(id N)/ M" in the formatted text.
" outputBlocks[(%s+%" PRIu64
")/ %" PRIu64
"]%s[(%s+%" PRIu64
") %% %" PRIu64
"] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputStride[1], sc->
outputBufferBlockSize, convTypeLeft, sc->
regIDs[1], sc->
regIDs[1], convTypeRight);
14099 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID / %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID / %" PRIu64
") %% 2)) * ((combinedID / %" PRIu64
")/2)) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
fftDim - 1, sc->
localSize[0], sc->
localSize[0], sc->
localSize[0]);
14103 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight);
14149 char shiftX[500] =
"";
14151 sprintf(shiftX,
" + consts.workGroupShiftX ");
14152 char shiftY[500] =
"";
14162 char shiftY2[100] =
"";
14164 sprintf(shiftY,
" + consts.workGroupShiftY ");
14173 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[0], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[0], sc->
fft_dim_full / sc->
firstStageStartSize);
14179 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s + %" PRIu64
" * %s) %% %" PRIu64
" + ((%s%s) / %" PRIu64
")*%" PRIu64
" < %" PRIu64
")){\n", sc->
gl_LocalInvocationID_x, sc->
localSize[0], sc->
gl_LocalInvocationID_y, sc->
localSize[1], sc->
gl_WorkGroupID_x, shiftX, sc->
firstStageStartSize / sc->
fftDim, sc->
localSize[1], sc->
fft_dim_full / sc->
firstStageStartSize);
14252 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[(2*(combinedID %% %" PRIu64
")+1) * sharedStride + (combinedID / %" PRIu64
")].x/2%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14254 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[(2*(combinedID %% %" PRIu64
")+1) * sharedStride + (combinedID / %" PRIu64
")].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14260 sc->
tempLen = sprintf(sc->
tempStr,
" %s[%s] = %ssdata[2*(combinedID %% %" PRIu64
")+1 + (combinedID / %" PRIu64
") * sharedStride].x/2%s;\n", outputsStruct, sc->
inoutID, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14262 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[2*(combinedID %% %" PRIu64
")+1 + (combinedID / %" PRIu64
") * sharedStride].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
fftDim / 8, sc->
fftDim / 8, convTypeRight);
14549 char shiftX[500] =
"";
14553 sc->
tempLen = sprintf(sc->
tempStr,
" if (((%s%s) / %" PRIu64
") %% (%" PRIu64
")+((%s%s) / %" PRIu64
") * (%" PRIu64
") < %" PRIu64
") {\n", sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
stageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * sc->
stageStartSize, sc->
fftDim * sc->
stageStartSize, sc->
size[sc->
axis_id]);
14564 sc->
tempLen = sprintf(sc->
tempStr,
" inoutID = (%s + %" PRIu64
") * (%" PRIu64
") + (((%s%s) / %" PRIu64
") %% (%" PRIu64
")) * (%" PRIu64
") + ((%s%s) / %" PRIu64
");\n", sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
fft_dim_full / sc->
fftDim, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x, sc->
firstStageStartSize / sc->
fftDim, sc->
fft_dim_full / sc->
firstStageStartSize, sc->
gl_GlobalInvocationID_x, shiftX, sc->
fft_dim_x * (sc->
firstStageStartSize / sc->
fftDim));
14587 sc->
tempLen = sprintf(sc->
tempStr,
" outputBlocks[%s / %" PRIu64
"]%s[%s %% %" PRIu64
"] = %ssdata[%s*(2*(%s+%" PRIu64
")+1) + %s].x/2%s;\n", sc->
inoutID, sc->
outputBufferBlockSize, outputsStruct, sc->
inoutID, sc->
outputBufferBlockSize, convTypeLeft, sc->
sharedStride, sc->
gl_LocalInvocationID_y, (i + k * sc->
min_registers_per_thread) * sc->
localSize[1], sc->
gl_LocalInvocationID_x, convTypeRight);
14662 char shiftX[500] =
"";
14664 sprintf(shiftX,
" + consts.workGroupShiftX ");
14665 char shiftY[500] =
"";
14668 char shiftY2[500] =
"";
14670 sprintf(shiftY2,
" + consts.workGroupShiftY ");
14679 uint64_t maxBluesteinCutOff = 1;
14696 sc->
tempLen = sprintf(sc->
tempStr,
" if(combinedID < %" PRIu64
"){\n", maxBluesteinCutOff);
14701 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) * sharedStride + (combinedID / %" PRIu64
");\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
14706 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID %% %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID %% %" PRIu64
") %% 2)) * ((combinedID %% %" PRIu64
")/2)) + (combinedID / %" PRIu64
")* sharedStride;\n", sc->
fftDim, sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
fftDim);
14760 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
14772 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (2*(combinedID %% %" PRIu64
")+1) );\n", cosDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
fftDim);
14775 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (2*(combinedID %% %" PRIu64
")+1) );\n", sinDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
fftDim);
14795 sc->
tempLen = sprintf(index_x,
"%" PRIu64
" - combinedID %% %" PRIu64
" + ((combinedID/%" PRIu64
") * %" PRIu64
")", 2 * sc->
fftDim - 1, sc->
fftDim, sc->
fftDim, sc->
outputStride[1]);
14799 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch);
14867 char shiftX[500] =
"";
14869 sprintf(shiftX,
" + consts.workGroupShiftX ");
14870 char shiftX2[500] =
"";
14873 char shiftY[500] =
"";
14876 char shiftY2[500] =
"";
14878 sprintf(shiftY2,
" + consts.workGroupShiftY ");
14901 sc->
tempLen = sprintf(sc->
tempStr,
" sdataID = (((combinedID / %" PRIu64
") %% 2) * %" PRIu64
" + (1-2*((combinedID / %" PRIu64
") %% 2)) * ((combinedID / %" PRIu64
")/2)) * sharedStride + (combinedID %% %" PRIu64
");\n", sc->
localSize[0], sc->
fftDim - 1, sc->
localSize[0], sc->
localSize[0], sc->
localSize[0]);
14946 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
14959 sc->
tempLen = sprintf(sc->
tempStr,
" mult.x = %s(%.17f%s * (2*(combinedID / %" PRIu64
")+1) );\n", cosDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
localSize[0]);
14962 sc->
tempLen = sprintf(sc->
tempStr,
" mult.y = %s(%.17f%s * (2*(combinedID / %" PRIu64
")+1) );\n", sinDef, -double_PI / 8 / sc->
fftDim, LFending, sc->
localSize[0]);
14987 res =
indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch);
15043 char shiftX[500] =
"";
15045 sprintf(shiftX,
" + consts.workGroupShiftX ");
15046 char shiftY[500] =
"";
15049 char shiftY2[500] =
"";
15051 sprintf(shiftY2,
" + consts.workGroupShiftY ");
15067 sdata[%s + %" PRIu64
"* sharedStride] = sdata[%s];\n\
15082 sdata[%s * sharedStride + %" PRIu64
"] = sdata[%s * sharedStride];\n\
15109 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
15121 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
15164 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15167 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, mult*sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15170 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15176 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15179 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
"- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15187 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15190 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15193 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15199 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15202 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
"- (2*sdataID+1)) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, mult * sc->
fftDim, sc->
fftDim, mult * sc->
fftDim);
15212 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID+1) + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], sc->
fftDim);
15214 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], sc->
fftDim);
15218 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID + 1)/2) %% 2) != 0) \n\
15224 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15235 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", sc->
fftDim/2, sc->
fftDim/4);
15240 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15243 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15246 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15252 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15255 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" + 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15263 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15266 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15269 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15275 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15278 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" + 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim - 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15288 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15290 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15294 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15300 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15311 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", 3 * sc->
fftDim / 4, sc->
fftDim / 2);
15316 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15319 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15322 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15328 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15331 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15339 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15342 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15345 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15351 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15354 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * (sc->
fftDim / 2), mult * sc->
fftDim, sc->
fftDim + 2 * (sc->
fftDim / 2), mult * sc->
fftDim);
15364 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15366 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], 2 * (sc->
fftDim / 2), sc->
fftDim);
15370 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15376 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15392 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15395 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15398 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y-sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15404 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].y+sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15407 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")].x+sdata[(2*sdataID - %" PRIu64
") * sharedStride + (combinedID / %" PRIu64
")].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15415 sc->
tempLen = sprintf(sc->
tempStr,
"if ( (combinedID / %" PRIu64
") %% 2 == 0){\n", sc->
fftDim);
15418 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15421 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y-sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15427 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x = 0.5%s*(sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].y+sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].y);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15430 sc->
tempLen = sprintf(sc->
tempStr,
" %s.y = 0.5%s*(-sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride].x+sdata[(2*sdataID - %" PRIu64
") + (combinedID / %" PRIu64
") * sharedStride].x);\n", sc->
regIDs[0], LFending, 2 * sc->
fftDim - 1, mult * sc->
fftDim, sc->
fftDim - 1, mult * sc->
fftDim);
15440 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) + (combinedID / %" PRIu64
") * sharedStride];\n", sc->
regIDs[0], 2 * sc->
fftDim - 1, sc->
fftDim);
15442 sc->
tempLen = sprintf(sc->
tempStr,
" %s = sdata[(%" PRIu64
" - 2*sdataID) * sharedStride + (combinedID / %" PRIu64
")];\n", sc->
regIDs[0], 2 * sc->
fftDim - 1, sc->
fftDim);
15446 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15452 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15461 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x *= 1.41421356237309504880%s;\n", sc->
regIDs[1], LFending);
15465 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->
regIDs[1], convTypeRight);
15494 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[0] != 0) {
15501 if ((uint64_t)ceil(sc->
size[1] / (
double)mult) % sc->
localSize[1] != 0) {
15578 char shiftX[500] =
"";
15581 char shiftY[500] =
"";
15584 char shiftY2[500] =
"";
15586 sprintf(shiftY2,
" + consts.workGroupShiftY ");
15638 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID + 1)/2) %% 2) != 0) \n\
15644 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15655 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", sc->
fftDim / 2, sc->
fftDim / 4);
15661 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15667 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15678 sc->
tempLen = sprintf(sc->
tempStr,
" if((sdataID < %" PRIu64
")&&(sdataID >= %" PRIu64
")){\n", 3 * sc->
fftDim / 4, sc->
fftDim / 2);
15684 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15690 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15707 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID+1)/2) %% 2) != 0) \n\
15713 sc->
tempLen = sprintf(sc->
tempStr,
" if ((((sdataID)/2) %% 2) != 0) \n\
15722 sc->
tempLen = sprintf(sc->
tempStr,
" %s.x *= 1.41421356237309504880%s;\n", sc->
regIDs[1], LFending);
15726 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->
regIDs[1], convTypeRight);
15773 char vecTypeInput[30];
15774 char vecTypeOutput[30];
15775 char inputsStruct[20] =
"";
15776 char outputsStruct[20] =
"";
15777 char LFending[4] =
"";
15778 if (!strcmp(floatType,
"float")) sprintf(LFending,
"f");
15779#if(VKFFT_BACKEND==0)
15781 sprintf(inputsStruct,
"inputs");
15783 sprintf(inputsStruct,
".inputs");
15785 sprintf(outputsStruct,
"outputs");
15787 sprintf(outputsStruct,
".outputs");
15788 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
15789 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
15790 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
15791 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
15792 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"vec2");
15793 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"dvec2");
15794 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
15795 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"vec2");
15796 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"dvec2");
15809 if (!strcmp(floatType,
"double")) sprintf(LFending,
"LF");
15810 char cosDef[20] =
"cos";
15811 char sinDef[20] =
"sin";
15812#elif(VKFFT_BACKEND==1)
15813 sprintf(inputsStruct,
"inputs");
15814 sprintf(outputsStruct,
"outputs");
15815 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
15816 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
15817 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
15818 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
15819 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
15820 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
15821 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
15822 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
15823 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
15836 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
15837 char cosDef[20] =
"__cosf";
15838 char sinDef[20] =
"__sinf";
15839#elif(VKFFT_BACKEND==2)
15840 sprintf(inputsStruct,
"inputs");
15841 sprintf(outputsStruct,
"outputs");
15842 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
15843 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
15844 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
15845 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
15846 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
15847 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
15848 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
15849 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
15850 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
15863 if (!strcmp(floatType,
"double")) sprintf(LFending,
"l");
15864 char cosDef[20] =
"__cosf";
15865 char sinDef[20] =
"__sinf";
15866#elif(VKFFT_BACKEND==3)
15867 sprintf(inputsStruct,
"inputs");
15868 sprintf(outputsStruct,
"outputs");
15869 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
15870 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
15871 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
15872 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
15873 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
15874 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
15875 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
15876 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
15877 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
15891 char cosDef[20] =
"native_cos";
15892 char sinDef[20] =
"native_sin";
15896 sprintf(sc->
tshuffle,
"tshuffle");
15899 sprintf(sc->
inoutID,
"inoutID");
15900 sprintf(sc->
sdataID,
"sdataID");
15902 char convTypeLeftInput[20] =
"";
15903 char convTypeRightInput[20] =
"";
15904 if ((!strcmp(floatType,
"float")) && (strcmp(floatTypeInputMemory,
"float"))) {
15905#if(VKFFT_BACKEND==0)
15906 sprintf(convTypeLeftInput,
"vec2(");
15907 sprintf(convTypeRightInput,
")");
15908#elif(VKFFT_BACKEND==1)
15909 sprintf(convTypeLeftInput,
"conv_float2(");
15910 sprintf(convTypeRightInput,
")");
15911#elif(VKFFT_BACKEND==2)
15912 sprintf(convTypeLeftInput,
"conv_float2(");
15913 sprintf(convTypeRightInput,
")");
15914#elif(VKFFT_BACKEND==3)
15915 sprintf(convTypeLeftInput,
"conv_float2(");
15916 sprintf(convTypeRightInput,
")");
15919 if ((!strcmp(floatType,
"double")) && (strcmp(floatTypeInputMemory,
"double"))) {
15920#if(VKFFT_BACKEND==0)
15921 sprintf(convTypeLeftInput,
"dvec2(");
15922 sprintf(convTypeRightInput,
")");
15923#elif(VKFFT_BACKEND==1)
15924 sprintf(convTypeLeftInput,
"conv_double2(");
15925 sprintf(convTypeRightInput,
")");
15926#elif(VKFFT_BACKEND==2)
15927 sprintf(convTypeLeftInput,
"conv_double2(");
15928 sprintf(convTypeRightInput,
")");
15929#elif(VKFFT_BACKEND==3)
15930 sprintf(convTypeLeftInput,
"conv_double2(");
15931 sprintf(convTypeRightInput,
")");
15935 char convTypeLeftOutput[20] =
"";
15936 char convTypeRightOutput[20] =
"";
15937 if ((!strcmp(floatTypeOutputMemory,
"half")) && (strcmp(floatType,
"half"))) {
15938 sprintf(convTypeLeftOutput,
"f16vec2(");
15939 sprintf(convTypeRightOutput,
")");
15941 if ((!strcmp(floatTypeOutputMemory,
"float")) && (strcmp(floatType,
"float"))) {
15942#if(VKFFT_BACKEND==0)
15943 sprintf(convTypeLeftOutput,
"vec2(");
15944 sprintf(convTypeRightOutput,
")");
15945#elif(VKFFT_BACKEND==1)
15946 sprintf(convTypeLeftOutput,
"(float2)");
15947#elif(VKFFT_BACKEND==2)
15948 sprintf(convTypeLeftOutput,
"(float2)");
15949#elif(VKFFT_BACKEND==3)
15950 sprintf(convTypeLeftOutput,
"conv_float2(");
15951 sprintf(convTypeRightOutput,
")");
15954 if ((!strcmp(floatTypeOutputMemory,
"double")) && (strcmp(floatType,
"double"))) {
15955#if(VKFFT_BACKEND==0)
15956 sprintf(convTypeLeftOutput,
"dvec2(");
15957 sprintf(convTypeRightOutput,
")");
15958#elif(VKFFT_BACKEND==1)
15959 sprintf(convTypeLeftOutput,
"(double2)");
15960#elif(VKFFT_BACKEND==2)
15961 sprintf(convTypeLeftOutput,
"(double2)");
15962#elif(VKFFT_BACKEND==3)
15963 sprintf(convTypeLeftOutput,
"conv_double2(");
15964 sprintf(convTypeRightOutput,
")");
15970 res =
appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory);
15976 if ((!sc->
LUT) && (!strcmp(floatType,
"double"))) {
15980 if (strcmp(floatType, floatTypeInputMemory)) {
15984 if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) {
16016#if(VKFFT_BACKEND==0)
16020#elif(VKFFT_BACKEND==1)
16024 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16028 sc->
tempLen = sprintf(sc->
tempStr,
", %s* kernel_obj", vecType);
16033 sc->
tempLen = sprintf(sc->
tempStr,
", %s* twiddleLUT", vecType);
16041#elif(VKFFT_BACKEND==2)
16045 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16049 sc->
tempLen = sprintf(sc->
tempStr,
", %s* kernel_obj", vecType);
16054 sc->
tempLen = sprintf(sc->
tempStr,
", %s* twiddleLUT", vecType);
16062#elif(VKFFT_BACKEND==3)
16063 sc->
tempLen = sprintf(sc->
tempStr,
"__kernel __attribute__((reqd_work_group_size(%" PRIu64
", %" PRIu64
", %" PRIu64
"))) void VkFFT_main_R2C ", sc->
localSize[0], sc->
localSize[1], sc->
localSize[2]);
16066 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput);
16070 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* kernel_obj", vecType);
16075 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* twiddleLUT", vecType);
16087 char index_x[2000] =
"";
16088 char idX[500] =
"";
16095 sc->
tempLen = sprintf(sc->
tempStr,
"%s id_x = %s %% %" PRIu64
";\n", uintType, idX, (uint64_t)ceil(sc->
size[0] / 4.0));
16098 sc->
tempLen = sprintf(sc->
tempStr,
"%s id_y = (%s / %" PRIu64
") %% %" PRIu64
";\n", uintType, idX, (uint64_t)ceil(sc->
size[0] / 4.0), sc->
size[1]);
16101 sc->
tempLen = sprintf(sc->
tempStr,
"%s id_z = (%s / %" PRIu64
") / %" PRIu64
";\n", uintType, idX, (uint64_t)ceil(sc->
size[0] / 4.0), sc->
size[1]);
16104 sc->
tempLen = sprintf(sc->
tempStr,
"if (%s < %" PRIu64
"){\n", idX, (uint64_t)ceil(sc->
size[0] / 4.0) * sc->
size[1] * sc->
size[2]);
16125 sc->
tempLen = sprintf(sc->
tempStr,
" %s t0 = %s%s[inoutID]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput);
16133 if (sc->
size[0] % 4 == 0) {
16141 sprintf(index_x,
"%" PRIu64
" + id_y*%" PRIu64
" +id_z*%" PRIu64
"", (sc->
size[0] / 2), sc->
inputStride[1], sc->
inputStride[2]);
16151 sprintf(index_x,
"%" PRIu64
" + id_y*%" PRIu64
" +id_z*%" PRIu64
"", (uint64_t)ceil(sc->
size[0] / 4.0), sc->
inputStride[1], sc->
inputStride[2]);
16158 sc->
tempLen = sprintf(sc->
tempStr,
" tf = %s%s[inoutID3]%s;\n", convTypeLeftInput, inputsStruct, convTypeRightInput);
16171 sprintf(index_x,
"(%" PRIu64
"-id_x) + id_y*%" PRIu64
" +id_z*%" PRIu64
"", (sc->
size[0] / 2), sc->
inputStride[1], sc->
inputStride[2]);
16186 sprintf(index_x,
"(%" PRIu64
"-id_x) + id_y*%" PRIu64
" +id_z*%" PRIu64
"", (sc->
size[0] / 2), sc->
inputStride[1], sc->
inputStride[2]);
16194 sc->
tempLen = sprintf(sc->
tempStr,
" %s t1 = %s%s[inoutID2]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput);
16209 if (sc->
size[0] % 4 == 0) {
16240 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16247 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16254 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID3] = %stf%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16285 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16292 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16313 sc->
tempLen = sprintf(sc->
tempStr,
" tf = twiddleLUT[id_x];\n");
16318 sc->
tempLen = sprintf(sc->
tempStr,
" %s angle = (loc_PI*id_x)/%" PRIu64
";\n", floatType, sc->
size[0] / 2);
16321 if (!strcmp(floatType,
"float")) {
16322 sc->
tempLen = sprintf(sc->
tempStr,
" tf.x = %s(angle);\n", cosDef);
16325 sc->
tempLen = sprintf(sc->
tempStr,
" tf.y = %s(angle);\n", sinDef);
16329 if (!strcmp(floatType,
"double")) {
16330 sc->
tempLen = sprintf(sc->
tempStr,
" tf = sincos_20(angle);\n");
16336 sc->
tempLen = sprintf(sc->
tempStr,
" t0.x = tf.x*t2.y-tf.y*t3.x;\n");
16339 sc->
tempLen = sprintf(sc->
tempStr,
" t0.y = -tf.y*t2.y-tf.x*t3.x;\n");
16356 sc->
tempLen = sprintf(sc->
tempStr,
" t0.x = tf.x*t2.y+tf.y*t3.x;\n");
16359 sc->
tempLen = sprintf(sc->
tempStr,
" t0.y = -tf.y*t2.y+tf.x*t3.x;\n");
16381 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID] = %st0%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16388 sc->
tempLen = sprintf(sc->
tempStr,
" %s[inoutID2] = %st1%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput);
16442 char vecTypeInput[30];
16443 char vecTypeOutput[30];
16444#if(VKFFT_BACKEND==0)
16445 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16446 if (!strcmp(floatType,
"float")) sprintf(vecType,
"vec2");
16447 if (!strcmp(floatType,
"double")) sprintf(vecType,
"dvec2");
16448 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16449 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"vec2");
16450 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"dvec2");
16451 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16452 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"vec2");
16453 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"dvec2");
16466#elif(VKFFT_BACKEND==1)
16467 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16468 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
16469 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
16470 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16471 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
16472 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
16473 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16474 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
16475 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
16488#elif(VKFFT_BACKEND==2)
16489 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16490 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
16491 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
16492 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16493 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
16494 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
16495 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16496 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
16497 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
16510#elif(VKFFT_BACKEND==3)
16511 if (!strcmp(floatType,
"half")) sprintf(vecType,
"f16vec2");
16512 if (!strcmp(floatType,
"float")) sprintf(vecType,
"float2");
16513 if (!strcmp(floatType,
"double")) sprintf(vecType,
"double2");
16514 if (!strcmp(floatTypeInputMemory,
"half")) sprintf(vecTypeInput,
"f16vec2");
16515 if (!strcmp(floatTypeInputMemory,
"float")) sprintf(vecTypeInput,
"float2");
16516 if (!strcmp(floatTypeInputMemory,
"double")) sprintf(vecTypeInput,
"double2");
16517 if (!strcmp(floatTypeOutputMemory,
"half")) sprintf(vecTypeOutput,
"f16vec2");
16518 if (!strcmp(floatTypeOutputMemory,
"float")) sprintf(vecTypeOutput,
"float2");
16519 if (!strcmp(floatTypeOutputMemory,
"double")) sprintf(vecTypeOutput,
"double2");
16535 sprintf(sc->
tshuffle,
"tshuffle");
16538 sprintf(sc->
inoutID,
"inoutID");
16539 sprintf(sc->
sdataID,
"sdataID");
16558 res =
appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory);
16573 if ((!sc->
LUT) && (!strcmp(floatType,
"double"))) {
16580 if (strcmp(floatType, floatTypeInputMemory)) {
16587 if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) {
16648 uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->
axisSwapped)) ? 1 : type;
16649#if(VKFFT_BACKEND==0)
16661#elif(VKFFT_BACKEND==1)
16662 sc->
tempLen = sprintf(sc->
tempStr,
"extern __shared__ float shared[];\n");
16677 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
16682 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
16687 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16692 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16697 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16702 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16707 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16712 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16717 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16722 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16727 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16732 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16737 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16742 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16747 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16758 sc->
tempLen = sprintf(sc->
tempStr,
", %s* kernel_obj", vecType);
16766 sc->
tempLen = sprintf(sc->
tempStr,
", %s* twiddleLUT", vecType);
16774 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinConvolutionKernel", vecType);
16782 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinMultiplication", vecType);
16801#elif(VKFFT_BACKEND==2)
16802 sc->
tempLen = sprintf(sc->
tempStr,
"extern __shared__ float shared[];\n");
16817 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput);
16822 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory);
16827 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16832 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16837 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16842 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16847 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16852 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16857 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16862 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16867 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16872 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16877 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16882 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16887 sc->
tempLen = sprintf(sc->
tempStr,
"(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput);
16897 sc->
tempLen = sprintf(sc->
tempStr,
", %s* kernel_obj", vecType);
16905 sc->
tempLen = sprintf(sc->
tempStr,
", %s* twiddleLUT", vecType);
16913 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinConvolutionKernel", vecType);
16921 sc->
tempLen = sprintf(sc->
tempStr,
", %s* BluesteinMultiplication", vecType);
16940#elif(VKFFT_BACKEND==3)
16950 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, vecTypeOutput);
16955 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", vecTypeInput, floatTypeOutputMemory);
16960 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16965 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16970 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16975 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16980 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16985 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16990 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
16995 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17000 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17005 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17010 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17015 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory);
17020 sc->
tempLen = sprintf(sc->
tempStr,
"(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput);
17030 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* kernel_obj", vecType);
17038 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* twiddleLUT", vecType);
17046 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* BluesteinConvolutionKernel", vecType);
17054 sc->
tempLen = sprintf(sc->
tempStr,
", __global %s* BluesteinMultiplication", vecType);
17097 sc->
tempLen = sprintf(sc->
tempStr,
" for (%s coordinate=%" PRIu64
"; coordinate > 0; coordinate--){\n\
17128 uint64_t stageSize = 1;
17129 uint64_t stageSizeSum = 0;
17130 double PI_const = 3.1415926535897932384626433832795;
17131 double stageAngle = (sc->
inverse) ? PI_const : -PI_const;
17132 for (uint64_t i = 0; i < sc->
numStages; i++) {
17154 stageSizeSum += stageSize;
17157 stageSizeSum += stageSize * 2;
17160 stageSizeSum += stageSize * 2;
17163 stageSizeSum += stageSize * 4;
17166 stageSizeSum += stageSize * 6;
17169 stageSizeSum += stageSize * 3;
17172 stageSizeSum += stageSize * 10;
17175 stageSizeSum += stageSize * 12;
17250 stageAngle = PI_const;
17252 for (uint64_t i = 0; i < sc->
numStages; i++) {
17260 stageSizeSum += stageSize;
17263 stageSizeSum += stageSize * 2;
17266 stageSizeSum += stageSize * 2;
17269 stageSizeSum += stageSize * 4;
17272 stageSizeSum += stageSize * 6;
17275 stageSizeSum += stageSize * 3;
17278 stageSizeSum += stageSize * 10;
17281 stageSizeSum += stageSize * 12;
17353#if(VKFFT_BACKEND==0)
17355 VkPhysicalDeviceMemoryProperties memoryProperties = { 0 };
17359 for (uint64_t i = 0; i < memoryProperties.memoryTypeCount; ++i) {
17360 if ((memoryTypeBits & ((uint64_t)1 << i)) && ((memoryProperties.memoryTypes[i].propertyFlags & properties) == properties) && (memoryProperties.memoryHeaps[memoryProperties.memoryTypes[i].heapIndex].size >= memorySize))
17362 memoryTypeIndex[0] = (uint32_t)i;
17370 VkResult res = VK_SUCCESS;
17371 uint32_t queueFamilyIndices;
17372 VkBufferCreateInfo
bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
17380 VkMemoryRequirements memoryRequirements = { 0 };
17381 vkGetBufferMemoryRequirements(app->
configuration.
device[0], buffer[0], &memoryRequirements);
17382 VkMemoryAllocateInfo
memoryAllocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
17393 VkResult res = VK_SUCCESS;
17395 VkDeviceSize stagingBufferSize = bufferSize;
17396 VkBuffer stagingBuffer = { 0 };
17397 VkDeviceMemory stagingBufferMemory = { 0 };
17398 resFFT =
allocateFFTBuffer(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
17401 res = vkMapMemory(app->
configuration.
device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data);
17403 memcpy(data, arr, stagingBufferSize);
17409 VkCommandBuffer commandBuffer = { 0 };
17416 VkBufferCopy copyRegion = { 0 };
17417 copyRegion.srcOffset = 0;
17418 copyRegion.dstOffset = 0;
17419 copyRegion.size = stagingBufferSize;
17420 vkCmdCopyBuffer(commandBuffer, stagingBuffer, buffer[0], 1, ©Region);
17421 res = vkEndCommandBuffer(commandBuffer);
17423 VkSubmitInfo
submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
17425 submitInfo.pCommandBuffers = &commandBuffer;
17439#if(VKFFT_BACKEND==0)
17441 if (
axis->bufferLUT != 0) {
17443 axis->bufferLUT = 0;
17445 if (
axis->bufferLUTDeviceMemory != 0) {
17447 axis->bufferLUTDeviceMemory = 0;
17450 if (
axis->descriptorPool != 0) {
17452 axis->descriptorPool = 0;
17454 if (
axis->descriptorSetLayout != 0) {
17456 axis->descriptorSetLayout = 0;
17458 if (
axis->pipelineLayout != 0) {
17460 axis->pipelineLayout = 0;
17462 if (
axis->pipeline != 0) {
17464 axis->pipeline = 0;
17466#elif(VKFFT_BACKEND==1)
17467 CUresult res = CUDA_SUCCESS;
17468 cudaError_t res_t = cudaSuccess;
17470 res_t = cudaFree(
axis->bufferLUT);
17471 axis->bufferLUT = 0;
17473 if (
axis->VkFFTModule != 0) {
17474 res = cuModuleUnload(
axis->VkFFTModule);
17475 axis->VkFFTModule = 0;
17477#elif(VKFFT_BACKEND==2)
17478 hipError_t res = hipSuccess;
17480 res = hipFree(
axis->bufferLUT);
17481 axis->bufferLUT = 0;
17483 if (
axis->VkFFTModule != 0) {
17484 res = hipModuleUnload(
axis->VkFFTModule);
17485 axis->VkFFTModule = 0;
17487#elif(VKFFT_BACKEND==3)
17490 res = clReleaseMemObject(
axis->bufferLUT);
17491 axis->bufferLUT = 0;
17493 if (
axis->program != 0) {
17494 res = clReleaseProgram(
axis->program);
17497 if (
axis->kernel != 0) {
17498 res = clReleaseKernel(
axis->kernel);
17504#if(VKFFT_BACKEND==0)
17506 glslang_finalize_process();
17509#elif(VKFFT_BACKEND==1)
17510 CUresult res = CUDA_SUCCESS;
17511 cudaError_t res_t = cudaSuccess;
17513 for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
17515 res_t = cudaEventDestroy(app->
configuration.stream_event[i]);
17524#elif(VKFFT_BACKEND==2)
17525 hipError_t res_t = hipSuccess;
17527 for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
17529 res_t = hipEventDestroy(app->
configuration.stream_event[i]);
17542#if(VKFFT_BACKEND==0)
17551#elif(VKFFT_BACKEND==1)
17556#elif(VKFFT_BACKEND==2)
17561#elif(VKFFT_BACKEND==3)
17580#if(VKFFT_BACKEND==0)
17605#elif(VKFFT_BACKEND==1)
17618#elif(VKFFT_BACKEND==2)
17631#elif(VKFFT_BACKEND==3)
17683static inline VkFFTResult VkFFTGetRegistersPerThread(uint64_t* loc_multipliers, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread, uint64_t* isGoodSequence) {
17684 for (uint64_t i = 0; i < 14; i++) {
17685 registers_per_thread_per_radix[i] = 0;
17687 registers_per_thread[0] = 0;
17688 min_registers_per_thread[0] = -1;
17690 if (loc_multipliers[2] > 0) {
17691 if (loc_multipliers[3] > 0) {
17692 if (loc_multipliers[5] > 0) {
17693 if (loc_multipliers[7] > 0) {
17694 if (loc_multipliers[11] > 0) {
17695 if (loc_multipliers[13] > 0) {
17696 switch (loc_multipliers[2]) {
17698 registers_per_thread_per_radix[2] = 14;
17699 registers_per_thread_per_radix[3] = 15;
17702 registers_per_thread_per_radix[2] = 12;
17703 registers_per_thread_per_radix[3] = 12;
17706 registers_per_thread_per_radix[2] = 12;
17707 registers_per_thread_per_radix[3] = 12;
17710 registers_per_thread_per_radix[2] = 16;
17711 registers_per_thread_per_radix[3] = 12;
17714 registers_per_thread_per_radix[5] = 15;
17715 registers_per_thread_per_radix[7] = 14;
17716 registers_per_thread_per_radix[11] = 11;
17717 registers_per_thread_per_radix[13] = 13;
17720 switch (loc_multipliers[2]) {
17722 registers_per_thread_per_radix[2] = 14;
17723 registers_per_thread_per_radix[3] = 15;
17726 registers_per_thread_per_radix[2] = 12;
17727 registers_per_thread_per_radix[3] = 12;
17730 registers_per_thread_per_radix[2] = 12;
17731 registers_per_thread_per_radix[3] = 12;
17734 registers_per_thread_per_radix[2] = 16;
17735 registers_per_thread_per_radix[3] = 12;
17738 registers_per_thread_per_radix[5] = 15;
17739 registers_per_thread_per_radix[7] = 14;
17740 registers_per_thread_per_radix[11] = 11;
17741 registers_per_thread_per_radix[13] = 0;
17745 if (loc_multipliers[13] > 0) {
17746 switch (loc_multipliers[2]) {
17748 registers_per_thread_per_radix[2] = 14;
17749 registers_per_thread_per_radix[3] = 15;
17752 registers_per_thread_per_radix[2] = 12;
17753 registers_per_thread_per_radix[3] = 12;
17756 registers_per_thread_per_radix[2] = 12;
17757 registers_per_thread_per_radix[3] = 12;
17760 registers_per_thread_per_radix[2] = 16;
17761 registers_per_thread_per_radix[3] = 12;
17764 registers_per_thread_per_radix[5] = 15;
17765 registers_per_thread_per_radix[7] = 14;
17766 registers_per_thread_per_radix[11] = 0;
17767 registers_per_thread_per_radix[13] = 13;
17771 switch (loc_multipliers[2]) {
17773 registers_per_thread_per_radix[2] = 14;
17774 registers_per_thread_per_radix[3] = 15;
17778 registers_per_thread_per_radix[2] = 12;
17779 registers_per_thread_per_radix[3] = 12;
17782 registers_per_thread_per_radix[2] = 12;
17783 registers_per_thread_per_radix[3] = 12;
17786 registers_per_thread_per_radix[2] = 16;
17787 registers_per_thread_per_radix[3] = 12;
17790 registers_per_thread_per_radix[5] = 15;
17791 registers_per_thread_per_radix[7] = 14;
17792 registers_per_thread_per_radix[11] = 0;
17793 registers_per_thread_per_radix[13] = 0;
17798 if (loc_multipliers[11] > 0) {
17799 if (loc_multipliers[13] > 0) {
17800 switch (loc_multipliers[2]) {
17802 registers_per_thread_per_radix[2] = 10;
17803 registers_per_thread_per_radix[3] = 15;
17806 registers_per_thread_per_radix[2] = 12;
17807 registers_per_thread_per_radix[3] = 12;
17810 registers_per_thread_per_radix[2] = 12;
17811 registers_per_thread_per_radix[3] = 12;
17814 registers_per_thread_per_radix[5] = 10;
17815 registers_per_thread_per_radix[7] = 0;
17816 registers_per_thread_per_radix[11] = 11;
17817 registers_per_thread_per_radix[13] = 13;
17820 switch (loc_multipliers[2]) {
17822 registers_per_thread_per_radix[2] = 10;
17823 registers_per_thread_per_radix[3] = 15;
17826 registers_per_thread_per_radix[2] = 12;
17827 registers_per_thread_per_radix[3] = 12;
17830 registers_per_thread_per_radix[2] = 12;
17831 registers_per_thread_per_radix[3] = 12;
17834 registers_per_thread_per_radix[5] = 10;
17835 registers_per_thread_per_radix[7] = 0;
17836 registers_per_thread_per_radix[11] = 11;
17837 registers_per_thread_per_radix[13] = 0;
17841 if (loc_multipliers[13] > 0) {
17842 switch (loc_multipliers[2]) {
17844 registers_per_thread_per_radix[2] = 10;
17845 registers_per_thread_per_radix[3] = 15;
17848 registers_per_thread_per_radix[2] = 12;
17849 registers_per_thread_per_radix[3] = 12;
17852 registers_per_thread_per_radix[2] = 12;
17853 registers_per_thread_per_radix[3] = 12;
17856 registers_per_thread_per_radix[5] = 10;
17857 registers_per_thread_per_radix[7] = 0;
17858 registers_per_thread_per_radix[11] = 0;
17859 registers_per_thread_per_radix[13] = 13;
17862 switch (loc_multipliers[2]) {
17864 registers_per_thread_per_radix[2] = 6;
17865 registers_per_thread_per_radix[3] = 6;
17866 registers_per_thread_per_radix[5] = 5;
17869 registers_per_thread_per_radix[2] = 12;
17870 registers_per_thread_per_radix[3] = 12;
17871 registers_per_thread_per_radix[5] = 10;
17874 registers_per_thread_per_radix[2] = 12;
17875 registers_per_thread_per_radix[3] = 12;
17876 registers_per_thread_per_radix[5] = 10;
17879 registers_per_thread_per_radix[7] = 0;
17880 registers_per_thread_per_radix[11] = 0;
17881 registers_per_thread_per_radix[13] = 0;
17889 if (loc_multipliers[7] > 0) {
17890 if (loc_multipliers[11] > 0) {
17891 if (loc_multipliers[13] > 0) {
17892 switch (loc_multipliers[2]) {
17894 registers_per_thread_per_radix[2] = 22;
17895 registers_per_thread_per_radix[3] = 21;
17896 registers_per_thread_per_radix[5] = 0;
17897 registers_per_thread_per_radix[7] = 21;
17898 registers_per_thread_per_radix[11] = 22;
17899 registers_per_thread_per_radix[13] = 26;
17902 registers_per_thread_per_radix[2] = 12;
17903 registers_per_thread_per_radix[3] = 12;
17904 registers_per_thread_per_radix[5] = 0;
17905 registers_per_thread_per_radix[7] = 14;
17906 registers_per_thread_per_radix[11] = 11;
17907 registers_per_thread_per_radix[13] = 13;
17910 registers_per_thread_per_radix[2] = 12;
17911 registers_per_thread_per_radix[3] = 12;
17912 registers_per_thread_per_radix[5] = 0;
17913 registers_per_thread_per_radix[7] = 14;
17914 registers_per_thread_per_radix[11] = 11;
17915 registers_per_thread_per_radix[13] = 13;
17920 switch (loc_multipliers[2]) {
17922 registers_per_thread_per_radix[2] = 22;
17923 registers_per_thread_per_radix[3] = 21;
17924 registers_per_thread_per_radix[5] = 0;
17925 registers_per_thread_per_radix[7] = 21;
17926 registers_per_thread_per_radix[11] = 22;
17927 registers_per_thread_per_radix[13] = 0;
17930 registers_per_thread_per_radix[2] = 12;
17931 registers_per_thread_per_radix[3] = 12;
17932 registers_per_thread_per_radix[5] = 0;
17933 registers_per_thread_per_radix[7] = 14;
17934 registers_per_thread_per_radix[11] = 11;
17935 registers_per_thread_per_radix[13] = 0;
17938 registers_per_thread_per_radix[2] = 12;
17939 registers_per_thread_per_radix[3] = 12;
17940 registers_per_thread_per_radix[5] = 0;
17941 registers_per_thread_per_radix[7] = 14;
17942 registers_per_thread_per_radix[11] = 11;
17943 registers_per_thread_per_radix[13] = 0;
17949 if (loc_multipliers[13] > 0) {
17950 switch (loc_multipliers[2]) {
17952 registers_per_thread_per_radix[2] = 26;
17953 registers_per_thread_per_radix[3] = 21;
17954 registers_per_thread_per_radix[5] = 0;
17955 registers_per_thread_per_radix[7] = 21;
17956 registers_per_thread_per_radix[11] = 0;
17957 registers_per_thread_per_radix[13] = 26;
17960 registers_per_thread_per_radix[2] = 12;
17961 registers_per_thread_per_radix[3] = 12;
17962 registers_per_thread_per_radix[5] = 0;
17963 registers_per_thread_per_radix[7] = 14;
17964 registers_per_thread_per_radix[11] = 0;
17965 registers_per_thread_per_radix[13] = 13;
17968 registers_per_thread_per_radix[2] = 12;
17969 registers_per_thread_per_radix[3] = 12;
17970 registers_per_thread_per_radix[5] = 0;
17971 registers_per_thread_per_radix[7] = 14;
17972 registers_per_thread_per_radix[11] = 0;
17973 registers_per_thread_per_radix[13] = 13;
17978 switch (loc_multipliers[2]) {
17980 registers_per_thread_per_radix[2] = 6;
17981 registers_per_thread_per_radix[3] = 6;
17982 registers_per_thread_per_radix[5] = 0;
17983 registers_per_thread_per_radix[7] = 7;
17984 registers_per_thread_per_radix[11] = 0;
17985 registers_per_thread_per_radix[13] = 0;
17988 registers_per_thread_per_radix[2] = 6;
17989 registers_per_thread_per_radix[3] = 6;
17990 registers_per_thread_per_radix[5] = 0;
17991 registers_per_thread_per_radix[7] = 7;
17992 registers_per_thread_per_radix[11] = 0;
17993 registers_per_thread_per_radix[13] = 0;
17996 registers_per_thread_per_radix[2] = 8;
17997 registers_per_thread_per_radix[3] = 6;
17998 registers_per_thread_per_radix[5] = 0;
17999 registers_per_thread_per_radix[7] = 7;
18000 registers_per_thread_per_radix[11] = 0;
18001 registers_per_thread_per_radix[13] = 0;
18008 if (loc_multipliers[11] > 0) {
18009 if (loc_multipliers[13] > 0) {
18010 switch (loc_multipliers[2]) {
18012 registers_per_thread_per_radix[2] = 6;
18013 registers_per_thread_per_radix[3] = 6;
18014 registers_per_thread_per_radix[5] = 0;
18015 registers_per_thread_per_radix[7] = 0;
18016 registers_per_thread_per_radix[11] = 11;
18017 registers_per_thread_per_radix[13] = 13;
18020 registers_per_thread_per_radix[2] = 12;
18021 registers_per_thread_per_radix[3] = 12;
18022 registers_per_thread_per_radix[5] = 0;
18023 registers_per_thread_per_radix[7] = 0;
18024 registers_per_thread_per_radix[11] = 11;
18025 registers_per_thread_per_radix[13] = 13;
18028 registers_per_thread_per_radix[2] = 12;
18029 registers_per_thread_per_radix[3] = 12;
18030 registers_per_thread_per_radix[5] = 0;
18031 registers_per_thread_per_radix[7] = 0;
18032 registers_per_thread_per_radix[11] = 11;
18033 registers_per_thread_per_radix[13] = 13;
18038 switch (loc_multipliers[2]) {
18040 registers_per_thread_per_radix[2] = 6;
18041 registers_per_thread_per_radix[3] = 6;
18042 registers_per_thread_per_radix[5] = 0;
18043 registers_per_thread_per_radix[7] = 0;
18044 registers_per_thread_per_radix[11] = 11;
18045 registers_per_thread_per_radix[13] = 0;
18048 registers_per_thread_per_radix[2] = 12;
18049 registers_per_thread_per_radix[3] = 12;
18050 registers_per_thread_per_radix[5] = 0;
18051 registers_per_thread_per_radix[7] = 0;
18052 registers_per_thread_per_radix[11] = 11;
18053 registers_per_thread_per_radix[13] = 0;
18056 registers_per_thread_per_radix[2] = 12;
18057 registers_per_thread_per_radix[3] = 12;
18058 registers_per_thread_per_radix[5] = 0;
18059 registers_per_thread_per_radix[7] = 0;
18060 registers_per_thread_per_radix[11] = 11;
18061 registers_per_thread_per_radix[13] = 0;
18067 if (loc_multipliers[13] > 0) {
18068 switch (loc_multipliers[2]) {
18070 registers_per_thread_per_radix[2] = 6;
18071 registers_per_thread_per_radix[3] = 6;
18072 registers_per_thread_per_radix[5] = 0;
18073 registers_per_thread_per_radix[7] = 0;
18074 registers_per_thread_per_radix[11] = 0;
18075 registers_per_thread_per_radix[13] = 13;
18078 registers_per_thread_per_radix[2] = 12;
18079 registers_per_thread_per_radix[3] = 12;
18080 registers_per_thread_per_radix[5] = 0;
18081 registers_per_thread_per_radix[7] = 0;
18082 registers_per_thread_per_radix[11] = 0;
18083 registers_per_thread_per_radix[13] = 13;
18086 registers_per_thread_per_radix[2] = 12;
18087 registers_per_thread_per_radix[3] = 12;
18088 registers_per_thread_per_radix[5] = 0;
18089 registers_per_thread_per_radix[7] = 0;
18090 registers_per_thread_per_radix[11] = 0;
18091 registers_per_thread_per_radix[13] = 13;
18096 switch (loc_multipliers[2]) {
18098 registers_per_thread_per_radix[2] = 6;
18099 registers_per_thread_per_radix[3] = 6;
18100 registers_per_thread_per_radix[5] = 0;
18101 registers_per_thread_per_radix[7] = 0;
18102 registers_per_thread_per_radix[11] = 0;
18103 registers_per_thread_per_radix[13] = 0;
18106 registers_per_thread_per_radix[2] = 12;
18107 registers_per_thread_per_radix[3] = 12;
18108 registers_per_thread_per_radix[5] = 0;
18109 registers_per_thread_per_radix[7] = 0;
18110 registers_per_thread_per_radix[11] = 0;
18111 registers_per_thread_per_radix[13] = 0;
18114 registers_per_thread_per_radix[2] = 12;
18115 registers_per_thread_per_radix[3] = 12;
18116 registers_per_thread_per_radix[5] = 0;
18117 registers_per_thread_per_radix[7] = 0;
18118 registers_per_thread_per_radix[11] = 0;
18119 registers_per_thread_per_radix[13] = 0;
18128 if (loc_multipliers[5] > 0) {
18129 if (loc_multipliers[7] > 0) {
18130 if (loc_multipliers[11] > 0) {
18131 if (loc_multipliers[13] > 0) {
18132 switch (loc_multipliers[2]) {
18134 registers_per_thread_per_radix[2] = 10;
18135 registers_per_thread_per_radix[3] = 0;
18136 registers_per_thread_per_radix[5] = 10;
18137 registers_per_thread_per_radix[7] = 14;
18138 registers_per_thread_per_radix[11] = 11;
18139 registers_per_thread_per_radix[13] = 13;
18142 registers_per_thread_per_radix[2] = 10;
18143 registers_per_thread_per_radix[3] = 0;
18144 registers_per_thread_per_radix[5] = 10;
18145 registers_per_thread_per_radix[7] = 14;
18146 registers_per_thread_per_radix[11] = 11;
18147 registers_per_thread_per_radix[13] = 13;
18150 registers_per_thread_per_radix[2] = 8;
18151 registers_per_thread_per_radix[3] = 0;
18152 registers_per_thread_per_radix[5] = 10;
18153 registers_per_thread_per_radix[7] = 14;
18154 registers_per_thread_per_radix[11] = 11;
18155 registers_per_thread_per_radix[13] = 13;
18158 registers_per_thread_per_radix[2] = 16;
18159 registers_per_thread_per_radix[3] = 0;
18160 registers_per_thread_per_radix[5] = 10;
18161 registers_per_thread_per_radix[7] = 14;
18162 registers_per_thread_per_radix[11] = 11;
18163 registers_per_thread_per_radix[13] = 13;
18168 switch (loc_multipliers[2]) {
18170 registers_per_thread_per_radix[2] = 10;
18171 registers_per_thread_per_radix[3] = 0;
18172 registers_per_thread_per_radix[5] = 10;
18173 registers_per_thread_per_radix[7] = 14;
18174 registers_per_thread_per_radix[11] = 11;
18175 registers_per_thread_per_radix[13] = 0;
18178 registers_per_thread_per_radix[2] = 10;
18179 registers_per_thread_per_radix[3] = 0;
18180 registers_per_thread_per_radix[5] = 10;
18181 registers_per_thread_per_radix[7] = 14;
18182 registers_per_thread_per_radix[11] = 11;
18183 registers_per_thread_per_radix[13] = 0;
18186 registers_per_thread_per_radix[2] = 8;
18187 registers_per_thread_per_radix[3] = 0;
18188 registers_per_thread_per_radix[5] = 10;
18189 registers_per_thread_per_radix[7] = 14;
18190 registers_per_thread_per_radix[11] = 11;
18191 registers_per_thread_per_radix[13] = 0;
18194 registers_per_thread_per_radix[2] = 16;
18195 registers_per_thread_per_radix[3] = 0;
18196 registers_per_thread_per_radix[5] = 10;
18197 registers_per_thread_per_radix[7] = 14;
18198 registers_per_thread_per_radix[11] = 11;
18199 registers_per_thread_per_radix[13] = 0;
18205 if (loc_multipliers[13] > 0) {
18206 switch (loc_multipliers[2]) {
18208 registers_per_thread_per_radix[2] = 10;
18209 registers_per_thread_per_radix[3] = 0;
18210 registers_per_thread_per_radix[5] = 10;
18211 registers_per_thread_per_radix[7] = 14;
18212 registers_per_thread_per_radix[11] = 0;
18213 registers_per_thread_per_radix[13] = 13;
18216 registers_per_thread_per_radix[2] = 10;
18217 registers_per_thread_per_radix[3] = 0;
18218 registers_per_thread_per_radix[5] = 10;
18219 registers_per_thread_per_radix[7] = 14;
18220 registers_per_thread_per_radix[11] = 0;
18221 registers_per_thread_per_radix[13] = 13;
18224 registers_per_thread_per_radix[2] = 8;
18225 registers_per_thread_per_radix[3] = 0;
18226 registers_per_thread_per_radix[5] = 10;
18227 registers_per_thread_per_radix[7] = 14;
18228 registers_per_thread_per_radix[11] = 0;
18229 registers_per_thread_per_radix[13] = 13;
18232 registers_per_thread_per_radix[2] = 16;
18233 registers_per_thread_per_radix[3] = 0;
18234 registers_per_thread_per_radix[5] = 10;
18235 registers_per_thread_per_radix[7] = 14;
18236 registers_per_thread_per_radix[11] = 0;
18237 registers_per_thread_per_radix[13] = 13;
18242 switch (loc_multipliers[2]) {
18244 registers_per_thread_per_radix[2] = 10;
18245 registers_per_thread_per_radix[3] = 0;
18246 registers_per_thread_per_radix[5] = 10;
18247 registers_per_thread_per_radix[7] = 7;
18248 registers_per_thread_per_radix[11] = 0;
18249 registers_per_thread_per_radix[13] = 0;
18252 registers_per_thread_per_radix[2] = 10;
18253 registers_per_thread_per_radix[3] = 0;
18254 registers_per_thread_per_radix[5] = 10;
18255 registers_per_thread_per_radix[7] = 7;
18256 registers_per_thread_per_radix[11] = 0;
18257 registers_per_thread_per_radix[13] = 0;
18260 registers_per_thread_per_radix[2] = 8;
18261 registers_per_thread_per_radix[3] = 0;
18262 registers_per_thread_per_radix[5] = 10;
18263 registers_per_thread_per_radix[7] = 7;
18264 registers_per_thread_per_radix[11] = 0;
18265 registers_per_thread_per_radix[13] = 0;
18272 if (loc_multipliers[11] > 0) {
18273 if (loc_multipliers[13] > 0) {
18274 switch (loc_multipliers[2]) {
18276 registers_per_thread_per_radix[2] = 10;
18277 registers_per_thread_per_radix[3] = 0;
18278 registers_per_thread_per_radix[5] = 10;
18279 registers_per_thread_per_radix[7] = 0;
18280 registers_per_thread_per_radix[11] = 11;
18281 registers_per_thread_per_radix[13] = 13;
18284 registers_per_thread_per_radix[2] = 10;
18285 registers_per_thread_per_radix[3] = 0;
18286 registers_per_thread_per_radix[5] = 10;
18287 registers_per_thread_per_radix[7] = 0;
18288 registers_per_thread_per_radix[11] = 11;
18289 registers_per_thread_per_radix[13] = 13;
18292 registers_per_thread_per_radix[2] = 8;
18293 registers_per_thread_per_radix[3] = 0;
18294 registers_per_thread_per_radix[5] = 10;
18295 registers_per_thread_per_radix[7] = 0;
18296 registers_per_thread_per_radix[11] = 11;
18297 registers_per_thread_per_radix[13] = 13;
18302 switch (loc_multipliers[2]) {
18304 registers_per_thread_per_radix[2] = 10;
18305 registers_per_thread_per_radix[3] = 0;
18306 registers_per_thread_per_radix[5] = 10;
18307 registers_per_thread_per_radix[7] = 0;
18308 registers_per_thread_per_radix[11] = 11;
18309 registers_per_thread_per_radix[13] = 0;
18312 registers_per_thread_per_radix[2] = 10;
18313 registers_per_thread_per_radix[3] = 0;
18314 registers_per_thread_per_radix[5] = 10;
18315 registers_per_thread_per_radix[7] = 0;
18316 registers_per_thread_per_radix[11] = 11;
18317 registers_per_thread_per_radix[13] = 0;
18320 registers_per_thread_per_radix[2] = 8;
18321 registers_per_thread_per_radix[3] = 0;
18322 registers_per_thread_per_radix[5] = 10;
18323 registers_per_thread_per_radix[7] = 0;
18324 registers_per_thread_per_radix[11] = 11;
18325 registers_per_thread_per_radix[13] = 0;
18331 if (loc_multipliers[13] > 0) {
18332 switch (loc_multipliers[2]) {
18334 registers_per_thread_per_radix[2] = 10;
18335 registers_per_thread_per_radix[3] = 0;
18336 registers_per_thread_per_radix[5] = 10;
18337 registers_per_thread_per_radix[7] = 0;
18338 registers_per_thread_per_radix[11] = 0;
18339 registers_per_thread_per_radix[13] = 13;
18342 registers_per_thread_per_radix[2] = 10;
18343 registers_per_thread_per_radix[3] = 0;
18344 registers_per_thread_per_radix[5] = 10;
18345 registers_per_thread_per_radix[7] = 0;
18346 registers_per_thread_per_radix[11] = 0;
18347 registers_per_thread_per_radix[13] = 13;
18350 registers_per_thread_per_radix[2] = 8;
18351 registers_per_thread_per_radix[3] = 0;
18352 registers_per_thread_per_radix[5] = 10;
18353 registers_per_thread_per_radix[7] = 0;
18354 registers_per_thread_per_radix[11] = 0;
18355 registers_per_thread_per_radix[13] = 13;
18360 switch (loc_multipliers[2]) {
18362 registers_per_thread_per_radix[2] = 10;
18363 registers_per_thread_per_radix[3] = 0;
18364 registers_per_thread_per_radix[5] = 10;
18365 registers_per_thread_per_radix[7] = 0;
18366 registers_per_thread_per_radix[11] = 0;
18367 registers_per_thread_per_radix[13] = 0;
18370 registers_per_thread_per_radix[2] = 10;
18371 registers_per_thread_per_radix[3] = 0;
18372 registers_per_thread_per_radix[5] = 10;
18373 registers_per_thread_per_radix[7] = 0;
18374 registers_per_thread_per_radix[11] = 0;
18375 registers_per_thread_per_radix[13] = 0;
18378 registers_per_thread_per_radix[2] = 8;
18379 registers_per_thread_per_radix[3] = 0;
18380 registers_per_thread_per_radix[5] = 10;
18381 registers_per_thread_per_radix[7] = 0;
18382 registers_per_thread_per_radix[11] = 0;
18383 registers_per_thread_per_radix[13] = 0;
18392 if (loc_multipliers[7] > 0) {
18393 if (loc_multipliers[11] > 0) {
18394 if (loc_multipliers[13] > 0) {
18395 switch (loc_multipliers[2]) {
18397 registers_per_thread_per_radix[2] = 14;
18398 registers_per_thread_per_radix[3] = 0;
18399 registers_per_thread_per_radix[5] = 0;
18400 registers_per_thread_per_radix[7] = 14;
18401 registers_per_thread_per_radix[11] = 11;
18402 registers_per_thread_per_radix[13] = 13;
18405 registers_per_thread_per_radix[2] = 14;
18406 registers_per_thread_per_radix[3] = 0;
18407 registers_per_thread_per_radix[5] = 0;
18408 registers_per_thread_per_radix[7] = 14;
18409 registers_per_thread_per_radix[11] = 11;
18410 registers_per_thread_per_radix[13] = 13;
18413 registers_per_thread_per_radix[2] = 8;
18414 registers_per_thread_per_radix[3] = 0;
18415 registers_per_thread_per_radix[5] = 0;
18416 registers_per_thread_per_radix[7] = 14;
18417 registers_per_thread_per_radix[11] = 11;
18418 registers_per_thread_per_radix[13] = 13;
18421 registers_per_thread_per_radix[2] = 16;
18422 registers_per_thread_per_radix[3] = 0;
18423 registers_per_thread_per_radix[5] = 0;
18424 registers_per_thread_per_radix[7] = 14;
18425 registers_per_thread_per_radix[11] = 11;
18426 registers_per_thread_per_radix[13] = 13;
18431 switch (loc_multipliers[2]) {
18433 registers_per_thread_per_radix[2] = 14;
18434 registers_per_thread_per_radix[3] = 0;
18435 registers_per_thread_per_radix[5] = 0;
18436 registers_per_thread_per_radix[7] = 14;
18437 registers_per_thread_per_radix[11] = 11;
18438 registers_per_thread_per_radix[13] = 0;
18441 registers_per_thread_per_radix[2] = 14;
18442 registers_per_thread_per_radix[3] = 0;
18443 registers_per_thread_per_radix[5] = 0;
18444 registers_per_thread_per_radix[7] = 14;
18445 registers_per_thread_per_radix[11] = 11;
18446 registers_per_thread_per_radix[13] = 0;
18449 registers_per_thread_per_radix[2] = 8;
18450 registers_per_thread_per_radix[3] = 0;
18451 registers_per_thread_per_radix[5] = 0;
18452 registers_per_thread_per_radix[7] = 14;
18453 registers_per_thread_per_radix[11] = 11;
18454 registers_per_thread_per_radix[13] = 0;
18457 registers_per_thread_per_radix[2] = 16;
18458 registers_per_thread_per_radix[3] = 0;
18459 registers_per_thread_per_radix[5] = 0;
18460 registers_per_thread_per_radix[7] = 14;
18461 registers_per_thread_per_radix[11] = 11;
18462 registers_per_thread_per_radix[13] = 0;
18468 if (loc_multipliers[13] > 0) {
18469 switch (loc_multipliers[2]) {
18471 registers_per_thread_per_radix[2] = 14;
18472 registers_per_thread_per_radix[3] = 0;
18473 registers_per_thread_per_radix[5] = 0;
18474 registers_per_thread_per_radix[7] = 14;
18475 registers_per_thread_per_radix[11] = 0;
18476 registers_per_thread_per_radix[13] = 13;
18479 registers_per_thread_per_radix[2] = 14;
18480 registers_per_thread_per_radix[3] = 0;
18481 registers_per_thread_per_radix[5] = 0;
18482 registers_per_thread_per_radix[7] = 14;
18483 registers_per_thread_per_radix[11] = 0;
18484 registers_per_thread_per_radix[13] = 13;
18487 registers_per_thread_per_radix[2] = 8;
18488 registers_per_thread_per_radix[3] = 0;
18489 registers_per_thread_per_radix[5] = 0;
18490 registers_per_thread_per_radix[7] = 14;
18491 registers_per_thread_per_radix[11] = 0;
18492 registers_per_thread_per_radix[13] = 13;
18495 registers_per_thread_per_radix[2] = 16;
18496 registers_per_thread_per_radix[3] = 0;
18497 registers_per_thread_per_radix[5] = 0;
18498 registers_per_thread_per_radix[7] = 14;
18499 registers_per_thread_per_radix[11] = 0;
18500 registers_per_thread_per_radix[13] = 13;
18505 switch (loc_multipliers[2]) {
18507 registers_per_thread_per_radix[2] = 14;
18508 registers_per_thread_per_radix[3] = 0;
18509 registers_per_thread_per_radix[5] = 0;
18510 registers_per_thread_per_radix[7] = 14;
18511 registers_per_thread_per_radix[11] = 0;
18512 registers_per_thread_per_radix[13] = 0;
18515 registers_per_thread_per_radix[2] = 14;
18516 registers_per_thread_per_radix[3] = 0;
18517 registers_per_thread_per_radix[5] = 0;
18518 registers_per_thread_per_radix[7] = 14;
18519 registers_per_thread_per_radix[11] = 0;
18520 registers_per_thread_per_radix[13] = 0;
18523 registers_per_thread_per_radix[2] = 14;
18524 registers_per_thread_per_radix[3] = 0;
18525 registers_per_thread_per_radix[5] = 0;
18526 registers_per_thread_per_radix[7] = 14;
18527 registers_per_thread_per_radix[11] = 0;
18528 registers_per_thread_per_radix[13] = 0;
18531 registers_per_thread_per_radix[2] = 14;
18532 registers_per_thread_per_radix[3] = 0;
18533 registers_per_thread_per_radix[5] = 0;
18534 registers_per_thread_per_radix[7] = 14;
18535 registers_per_thread_per_radix[11] = 0;
18536 registers_per_thread_per_radix[13] = 0;
18543 if (loc_multipliers[11] > 0) {
18544 if (loc_multipliers[13] > 0) {
18545 switch (loc_multipliers[2]) {
18547 registers_per_thread_per_radix[2] = 22;
18548 registers_per_thread_per_radix[3] = 0;
18549 registers_per_thread_per_radix[5] = 0;
18550 registers_per_thread_per_radix[7] = 0;
18551 registers_per_thread_per_radix[11] = 22;
18552 registers_per_thread_per_radix[13] = 26;
18555 registers_per_thread_per_radix[2] = 22;
18556 registers_per_thread_per_radix[3] = 0;
18557 registers_per_thread_per_radix[5] = 0;
18558 registers_per_thread_per_radix[7] = 0;
18559 registers_per_thread_per_radix[11] = 22;
18560 registers_per_thread_per_radix[13] = 26;
18563 registers_per_thread_per_radix[2] = 8;
18564 registers_per_thread_per_radix[3] = 0;
18565 registers_per_thread_per_radix[5] = 0;
18566 registers_per_thread_per_radix[7] = 0;
18567 registers_per_thread_per_radix[11] = 11;
18568 registers_per_thread_per_radix[13] = 13;
18573 switch (loc_multipliers[2]) {
18575 registers_per_thread_per_radix[2] = 22;
18576 registers_per_thread_per_radix[3] = 0;
18577 registers_per_thread_per_radix[5] = 0;
18578 registers_per_thread_per_radix[7] = 0;
18579 registers_per_thread_per_radix[11] = 22;
18580 registers_per_thread_per_radix[13] = 0;
18583 registers_per_thread_per_radix[2] = 22;
18584 registers_per_thread_per_radix[3] = 0;
18585 registers_per_thread_per_radix[5] = 0;
18586 registers_per_thread_per_radix[7] = 0;
18587 registers_per_thread_per_radix[11] = 22;
18588 registers_per_thread_per_radix[13] = 0;
18591 registers_per_thread_per_radix[2] = 8;
18592 registers_per_thread_per_radix[3] = 0;
18593 registers_per_thread_per_radix[5] = 0;
18594 registers_per_thread_per_radix[7] = 0;
18595 registers_per_thread_per_radix[11] = 11;
18596 registers_per_thread_per_radix[13] = 0;
18599 registers_per_thread_per_radix[2] = 8;
18600 registers_per_thread_per_radix[3] = 0;
18601 registers_per_thread_per_radix[5] = 0;
18602 registers_per_thread_per_radix[7] = 0;
18603 registers_per_thread_per_radix[11] = 11;
18604 registers_per_thread_per_radix[13] = 0;
18610 if (loc_multipliers[13] > 0) {
18611 switch (loc_multipliers[2]) {
18613 registers_per_thread_per_radix[2] = 26;
18614 registers_per_thread_per_radix[3] = 0;
18615 registers_per_thread_per_radix[5] = 0;
18616 registers_per_thread_per_radix[7] = 0;
18617 registers_per_thread_per_radix[11] = 0;
18618 registers_per_thread_per_radix[13] = 26;
18621 registers_per_thread_per_radix[2] = 26;
18622 registers_per_thread_per_radix[3] = 0;
18623 registers_per_thread_per_radix[5] = 0;
18624 registers_per_thread_per_radix[7] = 0;
18625 registers_per_thread_per_radix[11] = 0;
18626 registers_per_thread_per_radix[13] = 26;
18629 registers_per_thread_per_radix[2] = 8;
18630 registers_per_thread_per_radix[3] = 0;
18631 registers_per_thread_per_radix[5] = 0;
18632 registers_per_thread_per_radix[7] = 0;
18633 registers_per_thread_per_radix[11] = 0;
18634 registers_per_thread_per_radix[13] = 13;
18639 registers_per_thread_per_radix[2] = (loc_multipliers[2] > 2) ? 8 : (uint64_t)
pow(2, loc_multipliers[2]);
18640 registers_per_thread_per_radix[3] = 0;
18641 registers_per_thread_per_radix[5] = 0;
18642 registers_per_thread_per_radix[7] = 0;
18643 registers_per_thread_per_radix[11] = 0;
18644 registers_per_thread_per_radix[13] = 0;
18652 if (loc_multipliers[3] > 0) {
18653 if (loc_multipliers[5] > 0) {
18654 if (loc_multipliers[7] > 0) {
18655 if (loc_multipliers[11] > 0) {
18656 if (loc_multipliers[13] > 0) {
18657 registers_per_thread_per_radix[2] = 0;
18658 registers_per_thread_per_radix[3] = 15;
18659 registers_per_thread_per_radix[5] = 15;
18660 registers_per_thread_per_radix[7] = 21;
18661 registers_per_thread_per_radix[11] = 11;
18662 registers_per_thread_per_radix[13] = 13;
18665 registers_per_thread_per_radix[2] = 0;
18666 registers_per_thread_per_radix[3] = 15;
18667 registers_per_thread_per_radix[5] = 15;
18668 registers_per_thread_per_radix[7] = 21;
18669 registers_per_thread_per_radix[11] = 11;
18670 registers_per_thread_per_radix[13] = 0;
18674 if (loc_multipliers[13] > 0) {
18675 registers_per_thread_per_radix[2] = 0;
18676 registers_per_thread_per_radix[3] = 15;
18677 registers_per_thread_per_radix[5] = 15;
18678 registers_per_thread_per_radix[7] = 21;
18679 registers_per_thread_per_radix[11] = 0;
18680 registers_per_thread_per_radix[13] = 13;
18683 registers_per_thread_per_radix[2] = 0;
18684 registers_per_thread_per_radix[3] = 15;
18685 registers_per_thread_per_radix[5] = 15;
18686 registers_per_thread_per_radix[7] = 21;
18687 registers_per_thread_per_radix[11] = 0;
18688 registers_per_thread_per_radix[13] = 0;
18693 if (loc_multipliers[11] > 0) {
18694 if (loc_multipliers[13] > 0) {
18695 registers_per_thread_per_radix[2] = 0;
18696 registers_per_thread_per_radix[3] = 15;
18697 registers_per_thread_per_radix[5] = 15;
18698 registers_per_thread_per_radix[7] = 0;
18699 registers_per_thread_per_radix[11] = 11;
18700 registers_per_thread_per_radix[13] = 13;
18703 registers_per_thread_per_radix[2] = 0;
18704 registers_per_thread_per_radix[3] = 15;
18705 registers_per_thread_per_radix[5] = 15;
18706 registers_per_thread_per_radix[7] = 0;
18707 registers_per_thread_per_radix[11] = 11;
18708 registers_per_thread_per_radix[13] = 0;
18712 if (loc_multipliers[13] > 0) {
18713 registers_per_thread_per_radix[2] = 0;
18714 registers_per_thread_per_radix[3] = 15;
18715 registers_per_thread_per_radix[5] = 15;
18716 registers_per_thread_per_radix[7] = 0;
18717 registers_per_thread_per_radix[11] = 0;
18718 registers_per_thread_per_radix[13] = 13;
18721 registers_per_thread_per_radix[2] = 0;
18722 registers_per_thread_per_radix[3] = 15;
18723 registers_per_thread_per_radix[5] = 15;
18724 registers_per_thread_per_radix[7] = 0;
18725 registers_per_thread_per_radix[11] = 0;
18726 registers_per_thread_per_radix[13] = 0;
18733 if (loc_multipliers[7] > 0) {
18734 if (loc_multipliers[3] == 1) {
18735 if (loc_multipliers[11] > 0) {
18736 if (loc_multipliers[13] > 0) {
18737 registers_per_thread_per_radix[2] = 0;
18738 registers_per_thread_per_radix[3] = 21;
18739 registers_per_thread_per_radix[5] = 0;
18740 registers_per_thread_per_radix[7] = 21;
18741 registers_per_thread_per_radix[11] = 11;
18742 registers_per_thread_per_radix[13] = 13;
18745 registers_per_thread_per_radix[2] = 0;
18746 registers_per_thread_per_radix[3] = 21;
18747 registers_per_thread_per_radix[5] = 0;
18748 registers_per_thread_per_radix[7] = 21;
18749 registers_per_thread_per_radix[11] = 11;
18750 registers_per_thread_per_radix[13] = 0;
18754 if (loc_multipliers[13] > 0) {
18755 registers_per_thread_per_radix[2] = 0;
18756 registers_per_thread_per_radix[3] = 21;
18757 registers_per_thread_per_radix[5] = 0;
18758 registers_per_thread_per_radix[7] = 21;
18759 registers_per_thread_per_radix[11] = 0;
18760 registers_per_thread_per_radix[13] = 13;
18763 registers_per_thread_per_radix[2] = 0;
18764 registers_per_thread_per_radix[3] = 21;
18765 registers_per_thread_per_radix[5] = 0;
18766 registers_per_thread_per_radix[7] = 21;
18767 registers_per_thread_per_radix[11] = 0;
18768 registers_per_thread_per_radix[13] = 0;
18773 if (loc_multipliers[11] > 0) {
18774 if (loc_multipliers[13] > 0) {
18775 registers_per_thread_per_radix[2] = 0;
18776 registers_per_thread_per_radix[3] = 9;
18777 registers_per_thread_per_radix[5] = 0;
18778 registers_per_thread_per_radix[7] = 7;
18779 registers_per_thread_per_radix[11] = 11;
18780 registers_per_thread_per_radix[13] = 13;
18783 registers_per_thread_per_radix[2] = 0;
18784 registers_per_thread_per_radix[3] = 9;
18785 registers_per_thread_per_radix[5] = 0;
18786 registers_per_thread_per_radix[7] = 7;
18787 registers_per_thread_per_radix[11] = 11;
18788 registers_per_thread_per_radix[13] = 0;
18792 if (loc_multipliers[13] > 0) {
18793 registers_per_thread_per_radix[2] = 0;
18794 registers_per_thread_per_radix[3] = 9;
18795 registers_per_thread_per_radix[5] = 0;
18796 registers_per_thread_per_radix[7] = 7;
18797 registers_per_thread_per_radix[11] = 0;
18798 registers_per_thread_per_radix[13] = 13;
18801 registers_per_thread_per_radix[2] = 0;
18802 registers_per_thread_per_radix[3] = 9;
18803 registers_per_thread_per_radix[5] = 0;
18804 registers_per_thread_per_radix[7] = 7;
18805 registers_per_thread_per_radix[11] = 0;
18806 registers_per_thread_per_radix[13] = 0;
18812 if (loc_multipliers[3] == 1) {
18813 if (loc_multipliers[11] > 0) {
18814 if (loc_multipliers[13] > 0) {
18815 registers_per_thread_per_radix[2] = 0;
18816 registers_per_thread_per_radix[3] = 33;
18817 registers_per_thread_per_radix[5] = 0;
18818 registers_per_thread_per_radix[7] = 0;
18819 registers_per_thread_per_radix[11] = 33;
18820 registers_per_thread_per_radix[13] = 39;
18823 registers_per_thread_per_radix[2] = 0;
18824 registers_per_thread_per_radix[3] = 33;
18825 registers_per_thread_per_radix[5] = 0;
18826 registers_per_thread_per_radix[7] = 0;
18827 registers_per_thread_per_radix[11] = 33;
18828 registers_per_thread_per_radix[13] = 0;
18832 if (loc_multipliers[13] > 0) {
18833 registers_per_thread_per_radix[2] = 0;
18834 registers_per_thread_per_radix[3] = 39;
18835 registers_per_thread_per_radix[5] = 0;
18836 registers_per_thread_per_radix[7] = 0;
18837 registers_per_thread_per_radix[11] = 0;
18838 registers_per_thread_per_radix[13] = 39;
18841 registers_per_thread_per_radix[2] = 0;
18842 registers_per_thread_per_radix[3] = 3;
18843 registers_per_thread_per_radix[5] = 0;
18844 registers_per_thread_per_radix[7] = 0;
18845 registers_per_thread_per_radix[11] = 0;
18846 registers_per_thread_per_radix[13] = 0;
18851 if (loc_multipliers[11] > 0) {
18852 if (loc_multipliers[13] > 0) {
18853 registers_per_thread_per_radix[2] = 0;
18854 registers_per_thread_per_radix[3] = 9;
18855 registers_per_thread_per_radix[5] = 0;
18856 registers_per_thread_per_radix[7] = 0;
18857 registers_per_thread_per_radix[11] = 11;
18858 registers_per_thread_per_radix[13] = 13;
18861 registers_per_thread_per_radix[2] = 0;
18862 registers_per_thread_per_radix[3] = 9;
18863 registers_per_thread_per_radix[5] = 0;
18864 registers_per_thread_per_radix[7] = 0;
18865 registers_per_thread_per_radix[11] = 11;
18866 registers_per_thread_per_radix[13] = 0;
18870 if (loc_multipliers[13] > 0) {
18871 registers_per_thread_per_radix[2] = 0;
18872 registers_per_thread_per_radix[3] = 9;
18873 registers_per_thread_per_radix[5] = 0;
18874 registers_per_thread_per_radix[7] = 0;
18875 registers_per_thread_per_radix[11] = 0;
18876 registers_per_thread_per_radix[13] = 13;
18879 registers_per_thread_per_radix[2] = 0;
18880 registers_per_thread_per_radix[3] = 9;
18881 registers_per_thread_per_radix[5] = 0;
18882 registers_per_thread_per_radix[7] = 0;
18883 registers_per_thread_per_radix[11] = 0;
18884 registers_per_thread_per_radix[13] = 0;
18892 if (loc_multipliers[5] > 0) {
18893 if (loc_multipliers[7] > 0) {
18894 if (loc_multipliers[11] > 0) {
18895 if (loc_multipliers[13] > 0) {
18896 registers_per_thread_per_radix[2] = 0;
18897 registers_per_thread_per_radix[3] = 0;
18898 registers_per_thread_per_radix[5] = 5;
18899 registers_per_thread_per_radix[7] = 7;
18900 registers_per_thread_per_radix[11] = 11;
18901 registers_per_thread_per_radix[13] = 13;
18904 registers_per_thread_per_radix[2] = 0;
18905 registers_per_thread_per_radix[3] = 0;
18906 registers_per_thread_per_radix[5] = 5;
18907 registers_per_thread_per_radix[7] = 7;
18908 registers_per_thread_per_radix[11] = 11;
18909 registers_per_thread_per_radix[13] = 0;
18913 if (loc_multipliers[13] > 0) {
18914 registers_per_thread_per_radix[2] = 0;
18915 registers_per_thread_per_radix[3] = 0;
18916 registers_per_thread_per_radix[5] = 5;
18917 registers_per_thread_per_radix[7] = 7;
18918 registers_per_thread_per_radix[11] = 0;
18919 registers_per_thread_per_radix[13] = 13;
18922 registers_per_thread_per_radix[2] = 0;
18923 registers_per_thread_per_radix[3] = 0;
18924 registers_per_thread_per_radix[5] = 5;
18925 registers_per_thread_per_radix[7] = 7;
18926 registers_per_thread_per_radix[11] = 0;
18927 registers_per_thread_per_radix[13] = 0;
18932 if (loc_multipliers[11] > 0) {
18933 if (loc_multipliers[13] > 0) {
18934 registers_per_thread_per_radix[2] = 0;
18935 registers_per_thread_per_radix[3] = 0;
18936 registers_per_thread_per_radix[5] = 5;
18937 registers_per_thread_per_radix[7] = 0;
18938 registers_per_thread_per_radix[11] = 11;
18939 registers_per_thread_per_radix[13] = 13;
18942 registers_per_thread_per_radix[2] = 0;
18943 registers_per_thread_per_radix[3] = 0;
18944 registers_per_thread_per_radix[5] = 5;
18945 registers_per_thread_per_radix[7] = 0;
18946 registers_per_thread_per_radix[11] = 11;
18947 registers_per_thread_per_radix[13] = 0;
18951 if (loc_multipliers[13] > 0) {
18952 registers_per_thread_per_radix[2] = 0;
18953 registers_per_thread_per_radix[3] = 0;
18954 registers_per_thread_per_radix[5] = 5;
18955 registers_per_thread_per_radix[7] = 0;
18956 registers_per_thread_per_radix[11] = 0;
18957 registers_per_thread_per_radix[13] = 13;
18960 registers_per_thread_per_radix[2] = 0;
18961 registers_per_thread_per_radix[3] = 0;
18962 registers_per_thread_per_radix[5] = 5;
18963 registers_per_thread_per_radix[7] = 0;
18964 registers_per_thread_per_radix[11] = 0;
18965 registers_per_thread_per_radix[13] = 0;
18972 if (loc_multipliers[7] > 0) {
18973 if (loc_multipliers[11] > 0) {
18974 if (loc_multipliers[13] > 0) {
18975 registers_per_thread_per_radix[2] = 0;
18976 registers_per_thread_per_radix[3] = 0;
18977 registers_per_thread_per_radix[5] = 0;
18978 registers_per_thread_per_radix[7] = 7;
18979 registers_per_thread_per_radix[11] = 11;
18980 registers_per_thread_per_radix[13] = 13;
18983 registers_per_thread_per_radix[2] = 0;
18984 registers_per_thread_per_radix[3] = 0;
18985 registers_per_thread_per_radix[5] = 0;
18986 registers_per_thread_per_radix[7] = 7;
18987 registers_per_thread_per_radix[11] = 11;
18988 registers_per_thread_per_radix[13] = 0;
18992 if (loc_multipliers[13] > 0) {
18993 registers_per_thread_per_radix[2] = 0;
18994 registers_per_thread_per_radix[3] = 0;
18995 registers_per_thread_per_radix[5] = 0;
18996 registers_per_thread_per_radix[7] = 7;
18997 registers_per_thread_per_radix[11] = 0;
18998 registers_per_thread_per_radix[13] = 13;
19001 registers_per_thread_per_radix[2] = 0;
19002 registers_per_thread_per_radix[3] = 0;
19003 registers_per_thread_per_radix[5] = 0;
19004 registers_per_thread_per_radix[7] = 7;
19005 registers_per_thread_per_radix[11] = 0;
19006 registers_per_thread_per_radix[13] = 0;
19011 if (loc_multipliers[11] > 0) {
19012 if (loc_multipliers[13] > 0) {
19013 registers_per_thread_per_radix[2] = 0;
19014 registers_per_thread_per_radix[3] = 0;
19015 registers_per_thread_per_radix[5] = 0;
19016 registers_per_thread_per_radix[7] = 0;
19017 registers_per_thread_per_radix[11] = 11;
19018 registers_per_thread_per_radix[13] = 13;
19021 registers_per_thread_per_radix[2] = 0;
19022 registers_per_thread_per_radix[3] = 0;
19023 registers_per_thread_per_radix[5] = 0;
19024 registers_per_thread_per_radix[7] = 0;
19025 registers_per_thread_per_radix[11] = 11;
19026 registers_per_thread_per_radix[13] = 0;
19030 if (loc_multipliers[13] > 0) {
19031 registers_per_thread_per_radix[2] = 0;
19032 registers_per_thread_per_radix[3] = 0;
19033 registers_per_thread_per_radix[5] = 0;
19034 registers_per_thread_per_radix[7] = 0;
19035 registers_per_thread_per_radix[11] = 0;
19036 registers_per_thread_per_radix[13] = 13;
19047 for (uint64_t i = 0; i < 14; i++) {
19048 if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i];
19049 if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i];
19051 if ((registers_per_thread[0] > 10) || (registers_per_thread[0] >= 2 * min_registers_per_thread[0])) isGoodSequence[0] = 0;
19052 else isGoodSequence[0] = 1;
19059 uint64_t complexSize;
19061 complexSize = (2 *
sizeof(double));
19064 complexSize = (2 *
sizeof(float));
19066 complexSize = (2 *
sizeof(float));
19068 uint64_t maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
19070 for (uint64_t i = 0; i < 3; i++) {
19089 if (axis_id != nonStridedAxisId) {
19093 uint64_t multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
19095 for (uint64_t i = 2; i < 14; i++) {
19096 if (tempSequence % i == 0) {
19102 if (tempSequence != 1) {
19104 if (axis_id != nonStridedAxisId) {
19110 uint64_t FFTSizeSelected = 0;
19112 while (!FFTSizeSelected) {
19113 uint64_t testSequence = tempSequence;
19114 for (uint64_t i = 0; i < 20; i++) {
19115 multipliers[i] = 0;
19118 if (testSequence % i == 0) {
19124 if (testSequence == 1) FFTSizeSelected = 1;
19125 else tempSequence++;
19129 while (!FFTSizeSelected) {
19130 if (axis_id == nonStridedAxisId) {
19131 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence)));
19135 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence)));
19137 uint64_t testSequence = tempSequence;
19138 for (uint64_t i = 0; i < 20; i++) {
19139 multipliers[i] = 0;
19141 for (uint64_t i = 2; i < 8; i++) {
19142 if (testSequence % i == 0) {
19148 if (testSequence != 1) tempSequence++;
19150 uint64_t registers_per_thread_per_radix[14];
19151 uint64_t registers_per_thread = 0;
19152 uint64_t min_registers_per_thread = -1;
19153 uint64_t isGoodSequence = 0;
19154 res =
VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence);
19156 if (isGoodSequence) FFTSizeSelected = 1;
19157 else tempSequence++;
19168 uint64_t FFTSizeSelected = 0;
19170 while (!FFTSizeSelected) {
19171 uint64_t testSequence = tempSequence;
19172 for (uint64_t i = 0; i < 20; i++) {
19173 multipliers[i] = 0;
19176 if (testSequence % i == 0) {
19182 if (testSequence == 1) FFTSizeSelected = 1;
19183 else tempSequence++;
19187 while (!FFTSizeSelected) {
19188 if (axis_id == nonStridedAxisId) {
19189 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence)));
19193 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)
pow(2, (uint64_t)ceil(log2(tempSequence)));
19195 uint64_t testSequence = tempSequence;
19196 for (uint64_t i = 0; i < 20; i++) {
19197 multipliers[i] = 0;
19199 for (uint64_t i = 2; i < 8; i++) {
19200 if (testSequence % i == 0) {
19206 if (testSequence != 1) tempSequence++;
19208 uint64_t registers_per_thread_per_radix[14];
19209 uint64_t registers_per_thread = 0;
19210 uint64_t min_registers_per_thread = -1;
19211 uint64_t isGoodSequence = 0;
19212 res =
VkFFTGetRegistersPerThread(multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence);
19214 if (isGoodSequence) FFTSizeSelected = 1;
19215 else tempSequence++;
19224 maxSingleSizeNonStrided = maxSequenceLengthSharedMemory;
19248 uint64_t registerBoost = 1;
19255 uint64_t maxSingleSizeStrided = (!app->
configuration.
performConvolution) ? maxSequenceLengthSharedMemoryStrided * registerBoost : maxSequenceLengthSharedMemoryStrided;
19256 uint64_t numPasses = 1;
19257 uint64_t numPassesHalfBandwidth = 1;
19259 temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeStrided);
19270 numPasses = (uint64_t)ceil(log2(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided));
19272 numPasses += (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStrided));
19274 registerBoost = ((axis_id == nonStridedAxisId) && ((app->
useBluesteinFFT[axis_id]) || (!app->
configuration.
reorderFourStep) || (numPasses == 1))) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)(
pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)
pow(maxSequenceLengthSharedMemoryStrided, numPasses));
19275 uint64_t canBoost = 0;
19287 maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost;
19288 maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost;
19289 uint64_t maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
19292 temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (
double)maxSingleSizeStridedHalfBandwidth);
19296 for (uint64_t i = 0; i < 5; i++) {
19297 temp = (uint64_t)ceil(temp / (
double)maxSingleSizeStrided);
19298 numPassesHalfBandwidth++;
19299 if (temp == 1) i = 5;
19311 if (numPassesHalfBandwidth < numPasses) numPasses = numPassesHalfBandwidth;
19312 else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided;
19315 uint64_t* locAxisSplit = FFTPlan->
axisSplit[axis_id];
19316 if (numPasses == 1) {
19319 if (numPasses == 2) {
19322 uint64_t maxPow8SharedMemory = (uint64_t)
pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
19324 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxSingleSizeStrided) {
19325 locAxisSplit[0] = maxPow8SharedMemory;
19328 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided) {
19329 locAxisSplit[0] = maxSequenceLengthSharedMemory;
19332 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) < maxSingleSizeStridedHalfBandwidth) {
19333 for (uint64_t i = 1; i <= (uint64_t)log2(registerBoost); i++) {
19334 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)
pow(2, i)) <= maxSingleSizeStrided) {
19335 locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)
pow(2, i));
19336 i = (uint64_t)log2(registerBoost) + 1;
19341 locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
19347 uint64_t maxPow8Strided = (uint64_t)
pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
19349 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8Strided <= maxSingleSizeStrided) {
19350 locAxisSplit[0] = maxPow8Strided;
19353 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided < maxSingleSizeStridedHalfBandwidth) {
19354 locAxisSplit[0] = maxSingleSizeStrided;
19357 locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
19362 if (locAxisSplit[1] < 64) {
19363 locAxisSplit[0] = (locAxisSplit[1] == 0) ? locAxisSplit[0] / (64) : locAxisSplit[0] / (64 / locAxisSplit[1]);
19364 locAxisSplit[1] = 64;
19366 if (locAxisSplit[1] > locAxisSplit[0]) {
19367 uint64_t swap = locAxisSplit[0];
19368 locAxisSplit[0] = locAxisSplit[1];
19369 locAxisSplit[1] = swap;
19373 uint64_t successSplit = 0;
19386 for (uint64_t i = 0; i < sqrtSequence; i++) {
19388 if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSequenceLengthSharedMemory)) {
19390 locAxisSplit[1] = sqrtSequence - i;
19399 for (uint64_t i = 0; i < sqrtSequence; i++) {
19401 if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSingleSizeStridedHalfBandwidth)) {
19403 locAxisSplit[1] = sqrtSequence - i;
19410 if (successSplit == 0)
19414 if (numPasses == 3) {
19416 uint64_t maxPow8Strided = (uint64_t)
pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3);
19419 uint64_t maxPow8SharedMemory = (uint64_t)
pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3);
19420 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxPow8Strided * maxPow8Strided)
19421 locAxisSplit[0] = maxPow8SharedMemory;
19423 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided * maxSingleSizeStrided)
19424 locAxisSplit[0] = maxSequenceLengthSharedMemory;
19426 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19427 for (uint64_t i = 0; i <= (uint64_t)log2(registerBoost); i++) {
19428 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)
pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19429 locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)
pow(2, i));
19430 i = (uint64_t)log2(registerBoost) + 1;
19435 locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost);
19449 uint64_t maxPow8_128 = (uint64_t)
pow(8, ((uint64_t)log2(maxSingleSizeStrided128)) / 3);
19451 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxPow8_128 <= maxPow8Strided * maxSingleSizeStrided)
19452 locAxisSplit[0] = maxPow8_128;
19456 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 2) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 2 <= maxSingleSizeStrided128)) {
19457 locAxisSplit[0] = maxPow8_128 * 2;
19460 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 4) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 4 <= maxSingleSizeStrided128)) {
19461 locAxisSplit[0] = maxPow8_128 * 4;
19464 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided <= maxSingleSizeStrided * maxSingleSizeStrided) {
19465 for (uint64_t i = 0; i <= (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128); i++) {
19466 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSingleSizeStrided128 * (uint64_t)
pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) {
19467 locAxisSplit[0] = (maxSingleSizeStrided128 * (uint64_t)
pow(2, i));
19468 i = (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128) + 1;
19473 locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth;
19478 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxPow8Strided <= maxSingleSizeStrided) {
19479 locAxisSplit[1] = maxPow8Strided;
19480 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19483 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxSingleSizeStrided <= maxSingleSizeStrided) {
19484 locAxisSplit[1] = maxSingleSizeStrided;
19485 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19488 locAxisSplit[1] = maxSingleSizeStridedHalfBandwidth;
19489 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0];
19492 if (locAxisSplit[2] < 64) {
19493 locAxisSplit[1] = (locAxisSplit[2] == 0) ? locAxisSplit[1] / (64) : locAxisSplit[1] / (64 / locAxisSplit[2]);
19494 locAxisSplit[2] = 64;
19496 if (locAxisSplit[2] > locAxisSplit[1]) {
19497 uint64_t swap = locAxisSplit[1];
19498 locAxisSplit[1] = locAxisSplit[2];
19499 locAxisSplit[2] = swap;
19503 uint64_t successSplit = 0;
19505 for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) {
19506 if (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) {
19507 uint64_t sqrt3Sequence = (uint64_t)ceil(
sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)));
19508 for (uint64_t j = 0; j < sqrt3Sequence; j++) {
19509 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)) % (sqrt3Sequence - j) == 0) {
19510 if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (sqrt3Sequence - j <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j) <= maxSingleSizeStrided)) {
19511 locAxisSplit[0] = (maxSequenceLengthSharedMemory - i);
19512 locAxisSplit[1] = sqrt3Sequence - j;
19513 locAxisSplit[2] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j);
19514 i = maxSequenceLengthSharedMemory;
19525 for (uint64_t i = 0; i < sqrt3Sequence; i++) {
19527 uint64_t sqrt2Sequence = (uint64_t)ceil(
sqrt(FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)));
19528 for (uint64_t j = 0; j < sqrt2Sequence; j++) {
19529 if ((FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)) % (sqrt2Sequence - j) == 0) {
19530 if ((sqrt3Sequence - i <= maxSingleSizeStrided) && (sqrt2Sequence - j <= maxSingleSizeStrided) && (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j) <= maxSingleSizeStridedHalfBandwidth)) {
19531 locAxisSplit[0] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j);
19532 locAxisSplit[1] = sqrt3Sequence - i;
19533 locAxisSplit[2] = sqrt2Sequence - j;
19543 if (successSplit == 0)
19547 if (numPasses > 3) {
19560 for (uint64_t i = 0; i < numPasses; i++) {
19561 if ((locAxisSplit[0] % 2 != 0) && (locAxisSplit[i] % 2 == 0)) {
19562 uint64_t swap = locAxisSplit[0];
19563 locAxisSplit[0] = locAxisSplit[i];
19564 locAxisSplit[i] = swap;
19567 for (uint64_t i = 0; i < numPasses; i++) {
19568 if ((locAxisSplit[0] % 4 != 0) && (locAxisSplit[i] % 4 == 0)) {
19569 uint64_t swap = locAxisSplit[0];
19570 locAxisSplit[0] = locAxisSplit[i];
19571 locAxisSplit[i] = swap;
19574 for (uint64_t i = 0; i < numPasses; i++) {
19575 if ((locAxisSplit[0] % 8 != 0) && (locAxisSplit[i] % 8 == 0)) {
19576 uint64_t swap = locAxisSplit[0];
19577 locAxisSplit[0] = locAxisSplit[i];
19578 locAxisSplit[i] = swap;
19583 for (uint64_t k = 0; k < numPasses; k++) {
19584 tempSequence = locAxisSplit[k];
19585 uint64_t loc_multipliers[20] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
19586 for (uint64_t i = 2; i < 14; i++) {
19587 if (tempSequence % i == 0) {
19589 loc_multipliers[i]++;
19593 uint64_t registers_per_thread_per_radix[14];
19594 uint64_t registers_per_thread = 0;
19595 uint64_t min_registers_per_thread = -1;
19596 uint64_t isGoodSequence = 0;
19597 res =
VkFFTGetRegistersPerThread(loc_multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence);
19599 registers_per_thread_per_radix[8] = registers_per_thread_per_radix[2];
19600 registers_per_thread_per_radix[4] = registers_per_thread_per_radix[2];
19601 if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) {
19602 registers_per_thread *= 2;
19603 for (uint64_t i = 2; i < 14; i++) {
19604 registers_per_thread_per_radix[i] *= 2;
19606 min_registers_per_thread *= 2;
19608 if (registers_per_thread_per_radix[8] % 8 == 0) {
19609 loc_multipliers[8] = loc_multipliers[2] / 3;
19610 loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3;
19612 if (registers_per_thread_per_radix[4] % 4 == 0) {
19613 loc_multipliers[4] = loc_multipliers[2] / 2;
19614 loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2;
19616 if ((registerBoost == 2) && (loc_multipliers[2] == 0)) {
19617 if (loc_multipliers[4] > 0) {
19618 loc_multipliers[4]--;
19619 loc_multipliers[2] = 2;
19622 loc_multipliers[8]--;
19623 loc_multipliers[4]++;
19624 loc_multipliers[2]++;
19627 if ((registerBoost == 4) && (loc_multipliers[4] == 0)) {
19628 loc_multipliers[8]--;
19629 loc_multipliers[4]++;
19630 loc_multipliers[2]++;
// Grow scaleRegistersNum until maxBatchCoalesced * locAxisSplit[k] divided by
// (min_registers_per_thread * registerBoost * scaleRegistersNum) no longer exceeds
// app->configuration.maxThreadsNum. Each step multiplies the scale by a value i in 2..13
// that divides the current quotient.
19635 uint64_t scaleRegistersNum = 1;
19636 while ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum)) > app->
configuration.
maxThreadsNum) {
19637 for (uint64_t i = 2; i < 14; i++) {
19638 if (locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum) % i == 0) {
19639 scaleRegistersNum *= i;
// Apply the scale to the minimum register count.
19644 min_registers_per_thread *= scaleRegistersNum;
// For the total register count, search upwards from scaleRegistersNum for the first
// multiplier that evenly divides the corresponding quotient, then apply it.
19645 uint64_t temp_scaleRegistersNum = scaleRegistersNum;
19646 while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
19647 registers_per_thread *= temp_scaleRegistersNum;
// Same search and scaling for every radix that has a non-zero register count.
19648 for (uint64_t i = 2; i < 14; i++) {
19649 if (registers_per_thread_per_radix[i] != 0) {
19650 temp_scaleRegistersNum = scaleRegistersNum;
19651 while ((maxBatchCoalesced * locAxisSplit[k] / (registers_per_thread_per_radix[i] * registerBoost)) % temp_scaleRegistersNum != 0) temp_scaleRegistersNum++;
19652 registers_per_thread_per_radix[i] *= temp_scaleRegistersNum;
// Independent scaling can invert the ordering; swap so that min <= max holds again.
19656 if (min_registers_per_thread > registers_per_thread) {
19657 uint64_t temp = min_registers_per_thread;
19658 min_registers_per_thread = registers_per_thread;
19659 registers_per_thread = temp;
// Widen [min, max] so that it covers every non-zero per-radix register count.
19661 for (uint64_t i = 2; i < 14; i++) {
19662 if (registers_per_thread_per_radix[i] > registers_per_thread) {
19663 registers_per_thread = registers_per_thread_per_radix[i];
19665 if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) {
19666 min_registers_per_thread = registers_per_thread_per_radix[i];
19674 for (uint64_t i = 2; i < 14; i++) {
19679 uint64_t tempRegisterBoost = registerBoost;
19680 uint64_t switchRegisterBoost = 0;
19681 if (tempRegisterBoost > 1) {
19682 if (loc_multipliers[tempRegisterBoost] > 0) {
19683 loc_multipliers[tempRegisterBoost]--;
19684 switchRegisterBoost = tempRegisterBoost;
19687 for (uint64_t i = 14; i > 1; i--) {
19688 if (loc_multipliers[i] > 0) {
19689 loc_multipliers[i]--;
19690 switchRegisterBoost = i;
19696 for (uint64_t i = 14; i > 1; i--) {
19697 if (loc_multipliers[i] > 0) {
19699 loc_multipliers[i]--;
19705 if (switchRegisterBoost > 0) {
19710 if (min_registers_per_thread != registers_per_thread) {
// Pi in double precision; used for the phase angles computed further down.
19726 double double_PI = 3.1415926535897932384626433832795;
// Configuration of the auxiliary application (kernelPreparationApplication):
// a one-dimensional transform with sizes 1 along the second and third axes and useLUT set to 1.
19731 kernelPreparationConfiguration.
FFTdim = 1;
19733 kernelPreparationConfiguration.
size[1] = 1;
19734 kernelPreparationConfiguration.
size[2] = 1;
19736 kernelPreparationConfiguration.
useLUT = 1;
19749#if(VKFFT_BACKEND==0)
19756#elif(VKFFT_BACKEND==3)
// OpenCL backend: reuse the platform and context of the parent application.
19757 kernelPreparationConfiguration.platform = app->
configuration.platform;
19758 kernelPreparationConfiguration.context = app->
configuration.context;
// Size in bytes of one complex (two-component) single-precision array over all three sizes.
19761 uint64_t bufferSize = (uint64_t)
sizeof(
float) * 2 * kernelPreparationConfiguration.
size[0] * kernelPreparationConfiguration.
size[1] * kernelPreparationConfiguration.
size[2];
// In double precision the size is scaled by sizeof(double) / sizeof(float).
19762 if (kernelPreparationConfiguration.
doublePrecision) bufferSize *=
sizeof(double) /
sizeof(
float);
// Build the auxiliary application from the configuration above.
19768 resFFT =
initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration);
19771#if(VKFFT_BACKEND==0)
19772 VkResult res = VK_SUCCESS;
19783#elif(VKFFT_BACKEND==1)
19784 cudaError_t res = cudaSuccess;
19785 res = cudaMalloc((
void**)&app->
bufferBluestein[axis_id], bufferSize);
19795#elif(VKFFT_BACKEND==2)
19796 hipError_t res = hipSuccess;
19807#elif(VKFFT_BACKEND==3)
19808 cl_int res = CL_SUCCESS;
// Host staging buffer of bufferSize bytes that receives the phase vectors before upload.
19822 void* phaseVectors = malloc(bufferSize);
19823 if (!phaseVectors) {
// Double-precision fill. Element i is stored as an interleaved pair at [2*i] and [2*i + 1].
19832 double* phaseVectors_cast = (
double*)phaseVectors;
// rm = i^2 reduced modulo 2*N (N = phaseVectorsNonZeroSize), so angle = pi * rm / N stays below 2*pi.
19834 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
19835 double angle = double_PI * rm / phaseVectorsNonZeroSize;
// Entries with i < N hold (cos(angle), -sin(angle)); all other entries are zero.
19836 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
double)
cos(
angle) : 0;
19837 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
double)-
sin(
angle) : 0;
// Mirror entries 1..N-1 to the end of the array: element (actualFFTSize - i) becomes a copy of element i.
19839 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
19840 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
19841 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
// Single-precision fill: same values, computed in double and narrowed to float on store.
19845 float* phaseVectors_cast = (
float*)phaseVectors;
19847 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
19848 double angle = double_PI * rm / phaseVectorsNonZeroSize;
19849 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
float)
cos(
angle) : 0;
19850 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
float)-
sin(
angle) : 0;
// Same mirroring as in the double-precision branch.
19852 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
19853 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
19854 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
19857#if(VKFFT_BACKEND==0)
19860 free(phaseVectors);
19864#elif(VKFFT_BACKEND==1)
19865 res = cudaMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
19866 if (res != cudaSuccess) {
19867 free(phaseVectors);
19871#elif(VKFFT_BACKEND==2)
19872 res = hipMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
19873 if (res != hipSuccess) {
19874 free(phaseVectors);
19878#elif(VKFFT_BACKEND==3)
19879 res = clEnqueueWriteBuffer(commandQueue, app->
bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
19880 if (res != CL_SUCCESS) {
19881 free(phaseVectors);
19886#if(VKFFT_BACKEND==0)
19892 VkCommandBuffer commandBuffer = {};
19895 free(phaseVectors);
19903 free(phaseVectors);
19912 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19914 free(phaseVectors);
19918 res = vkEndCommandBuffer(commandBuffer);
19920 free(phaseVectors);
19924 VkSubmitInfo
submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
19926 submitInfo.pCommandBuffers = &commandBuffer;
19929 free(phaseVectors);
19935 free(phaseVectors);
19941 free(phaseVectors);
19947#elif(VKFFT_BACKEND==1)
// CUDA backend: run the auxiliary application with direction argument -1.
19951 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
// NOTE(review): the condition guarding this free is not visible here; presumably it is the
// error path for a failed VkFFTAppend - confirm.
19953 free(phaseVectors);
// Block the host until the device has finished; on failure release the staging buffer.
19957 res = cudaDeviceSynchronize();
19958 if (res != cudaSuccess) {
19959 free(phaseVectors);
19963#elif(VKFFT_BACKEND==2)
19967 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19969 free(phaseVectors);
19973 res = hipDeviceSynchronize();
19974 if (res != hipSuccess) {
19975 free(phaseVectors);
19979#elif(VKFFT_BACKEND==3)
19981 launchParams.commandQueue = &commandQueue;
19984 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
19986 free(phaseVectors);
19990 res = clFinish(commandQueue);
19991 if (res != CL_SUCCESS) {
19992 free(phaseVectors);
// Double-precision fill of the staging buffer. Identical to the earlier fill in this function
// except that the second component is +sin(angle) instead of -sin(angle).
19999 double* phaseVectors_cast = (
double*)phaseVectors;
// rm = i^2 reduced modulo 2*N (N = phaseVectorsNonZeroSize); angle = pi * rm / N.
20001 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
20002 double angle = double_PI * rm / phaseVectorsNonZeroSize;
// Entries with i < N hold (cos(angle), sin(angle)); all other entries are zero.
20003 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
double)
cos(
angle) : 0;
20004 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
double)
sin(
angle) : 0;
// Mirror entries 1..N-1: element (actualFFTSize - i) becomes a copy of element i.
20006 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
20007 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
20008 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
// Single-precision fill: same values, computed in double and narrowed to float on store.
20012 float* phaseVectors_cast = (
float*)phaseVectors;
20014 uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize);
20015 double angle = double_PI * rm / phaseVectorsNonZeroSize;
20016 phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (
float)
cos(
angle) : 0;
20017 phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (
float)
sin(
angle) : 0;
// Same mirroring as in the double-precision branch.
20019 for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) {
20020 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i];
20021 phaseVectors_cast[2 * (FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1];
20024#if(VKFFT_BACKEND==0)
20027 free(phaseVectors);
20031#elif(VKFFT_BACKEND==1)
20032 res = cudaMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice);
20033 if (res != cudaSuccess) {
20034 free(phaseVectors);
20038#elif(VKFFT_BACKEND==2)
20039 res = hipMemcpy(app->
bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice);
20040 if (res != hipSuccess) {
20041 free(phaseVectors);
20045#elif(VKFFT_BACKEND==3)
20046 res = clEnqueueWriteBuffer(commandQueue, app->
bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL);
20047 if (res != CL_SUCCESS) {
20048 free(phaseVectors);
20053#if(VKFFT_BACKEND==0)
20059 VkCommandBuffer commandBuffer = {};
20062 free(phaseVectors);
20070 free(phaseVectors);
20079 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20081 free(phaseVectors);
20085 res = vkEndCommandBuffer(commandBuffer);
20087 free(phaseVectors);
20091 VkSubmitInfo
submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
20093 submitInfo.pCommandBuffers = &commandBuffer;
20096 free(phaseVectors);
20102 free(phaseVectors);
20108 free(phaseVectors);
20119 VkCommandBuffer commandBuffer = {};
20122 free(phaseVectors);
20130 free(phaseVectors);
20139 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20141 free(phaseVectors);
20145 res = vkEndCommandBuffer(commandBuffer);
20147 free(phaseVectors);
20151 VkSubmitInfo
submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };
20153 submitInfo.pCommandBuffers = &commandBuffer;
20156 free(phaseVectors);
20162 free(phaseVectors);
20168 free(phaseVectors);
20174#elif(VKFFT_BACKEND==1)
// CUDA backend: first run the auxiliary application with direction argument -1...
20179 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
// NOTE(review): the guards around the free calls that directly follow VkFFTAppend are not
// visible here; presumably they are the VkFFTAppend failure paths - confirm.
20181 free(phaseVectors);
// ...wait for the device, releasing the staging buffer if synchronization fails...
20185 res = cudaDeviceSynchronize();
20186 if (res != cudaSuccess) {
20187 free(phaseVectors);
// ...then run it again with direction argument 1 and wait once more.
20194 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20196 free(phaseVectors);
20200 res = cudaDeviceSynchronize();
20201 if (res != cudaSuccess) {
20202 free(phaseVectors);
20207#elif(VKFFT_BACKEND==2)
20212 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20214 free(phaseVectors);
20218 res = hipDeviceSynchronize();
20219 if (res != hipSuccess) {
20220 free(phaseVectors);
20227 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20229 free(phaseVectors);
20233 res = hipDeviceSynchronize();
20234 if (res != hipSuccess) {
20235 free(phaseVectors);
20240#elif(VKFFT_BACKEND==3)
20242 launchParams.commandQueue = &commandQueue;
20246 resFFT =
VkFFTAppend(&kernelPreparationApplication, -1, &launchParams);
20248 free(phaseVectors);
20252 res = clFinish(commandQueue);
20253 if (res != CL_SUCCESS) {
20254 free(phaseVectors);
20261 resFFT =
VkFFTAppend(&kernelPreparationApplication, 1, &launchParams);
20263 free(phaseVectors);
20267 res = clFinish(commandQueue);
20268 if (res != CL_SUCCESS) {
20269 free(phaseVectors);
20275 free(phaseVectors);
20276#if(VKFFT_BACKEND==0)
20278#elif(VKFFT_BACKEND==3)
20279 res = clReleaseCommandQueue(commandQueue);
20286 uint64_t performUpdate = planStage;
20288 if (launchParams != 0) {
20347 if (performUpdate) {
20348 if (planStage)
axis->specializationConstants.performBufferSetUpdate = 1;
20381 if (
axis->specializationConstants.performBufferSetUpdate) {
20382#if(VKFFT_BACKEND==0)
20383 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
20385 uint64_t storageComplexSize;
20387 storageComplexSize = (2 *
sizeof(double));
20390 storageComplexSize = (2 * 2);
20392 storageComplexSize = (2 *
sizeof(float));
20393 for (uint64_t i = 0; i <
axis->numBindings; ++i) {
20394 for (uint64_t j = 0; j <
axis->specializationConstants.numBuffersBound[i]; ++j) {
20395#if(VKFFT_BACKEND==0)
20396 VkDescriptorBufferInfo descriptorBufferInfo = { 0 };
20403 uint64_t bufferId = 0;
20404 uint64_t offset = j;
20408 if (offset >= (uint64_t)ceil(app->
configuration.
inputBufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20419#if(VKFFT_BACKEND==0)
20421 descriptorBufferInfo.range = (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20422 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20428 uint64_t bufferId = 0;
20429 uint64_t offset = j;
20444#if(VKFFT_BACKEND==0)
20446 descriptorBufferInfo.range = (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20447 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20452 uint64_t bufferId = 0;
20453 uint64_t offset = j;
20455 if (((
axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->
useBluesteinFFT[axis_id] && (
axis->specializationConstants.reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1))) {
20459 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20461 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20470#if(VKFFT_BACKEND==0)
20478 if (offset >= (uint64_t)ceil(app->
configuration.
tempBufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20489#if(VKFFT_BACKEND==0)
20498 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20500 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20509#if(VKFFT_BACKEND==0)
20514#if(VKFFT_BACKEND==0)
20515 descriptorBufferInfo.range = (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20516 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20536 uint64_t bufferId = 0;
20537 uint64_t offset = j;
20551#if(VKFFT_BACKEND==0)
20553 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20554 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20559 uint64_t bufferId = 0;
20560 uint64_t offset = j;
20569 if (offset >= (uint64_t)ceil(app->
configuration.
inputBufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20580#if(VKFFT_BACKEND==0)
20586 if (((
axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->
useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (
axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) {
20589 if (offset >= (uint64_t)ceil(app->
configuration.
tempBufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20600#if(VKFFT_BACKEND==0)
20608 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20610 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20619#if(VKFFT_BACKEND==0)
20630 if (offset >= (uint64_t)ceil(app->
configuration.
inputBufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20641#if(VKFFT_BACKEND==0)
20649 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20651 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20660#if(VKFFT_BACKEND==0)
20666#if(VKFFT_BACKEND==0)
20667 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20668 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
// Resolve bound-buffer index j to a kernel buffer (bufferId) and a block offset inside it.
20674 uint64_t bufferId = 0;
20675 uint64_t offset = j;
// Skip over kernel buffer l when the running offset lies past its block count, and subtract
// that block count from the offset.
// NOTE(review): the block count here is derived from outputBufferBlockSize, while the
// descriptor range/offset below use kernelBlockSize. If the two sizes can differ this looks
// inconsistent - confirm which block size is intended for kernelSize[l].
20678 if (offset >= (uint64_t)ceil(app->
configuration.
kernelSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20680 offset -= (uint64_t)ceil(app->
configuration.
kernelSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20688#if(VKFFT_BACKEND==0)
// Vulkan: the descriptor covers one kernel block, located offset blocks into the buffer (in bytes).
20690 descriptorBufferInfo.range = (
axis->specializationConstants.kernelBlockSize * storageComplexSize);
20691 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.kernelBlockSize * storageComplexSize);
20696#if(VKFFT_BACKEND==0)
20697 descriptorBufferInfo.buffer =
axis->bufferLUT;
20698 descriptorBufferInfo.offset = 0;
20699 descriptorBufferInfo.range =
axis->bufferLUTSize;
20702 if ((i ==
axis->specializationConstants.BluesteinConvolutionBindingID) && (app->
useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) {
20703#if(VKFFT_BACKEND==0)
20704 if (
axis->specializationConstants.inverseBluestein)
20708 descriptorBufferInfo.offset = 0;
20712 if ((i ==
axis->specializationConstants.BluesteinMultiplicationBindingID) && (app->
useBluesteinFFT[axis_id]) && (axis_upload_id == (FFTPlan->
numAxisUploads[axis_id] - 1))) {
20713#if(VKFFT_BACKEND==0)
20715 descriptorBufferInfo.offset = 0;
20719#if(VKFFT_BACKEND==0)
20720 VkWriteDescriptorSet
writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET };
20731 axis->specializationConstants.performBufferSetUpdate = 0;
20736 if (
axis->specializationConstants.performBufferSetUpdate) {
20737#if(VKFFT_BACKEND==0)
20738 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
20740 uint64_t storageComplexSize;
20742 storageComplexSize = (2 *
sizeof(double));
20745 storageComplexSize = (2 * 2);
20747 storageComplexSize = (2 *
sizeof(float));
20748 for (uint64_t i = 0; i <
axis->numBindings; ++i) {
20749 for (uint64_t j = 0; j <
axis->specializationConstants.numBuffersBound[i]; ++j) {
20750#if(VKFFT_BACKEND==0)
20751 VkDescriptorBufferInfo descriptorBufferInfo = { 0 };
20754 uint64_t bufferId = 0;
20755 uint64_t offset = j;
20761 uint64_t bufferId = 0;
20762 uint64_t offset = j;
20765 if (offset >= (uint64_t)ceil(app->
configuration.
inputBufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20776#if(VKFFT_BACKEND==0)
20778 descriptorBufferInfo.range = (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20779 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20785 uint64_t bufferId = 0;
20786 uint64_t offset = j;
20800#if(VKFFT_BACKEND==0)
20802 descriptorBufferInfo.range = (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20803 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20808 uint64_t bufferId = 0;
20809 uint64_t offset = j;
20812 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) {
20814 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
20823#if(VKFFT_BACKEND==0)
20827#if(VKFFT_BACKEND==0)
20828 descriptorBufferInfo.range = (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20829 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.inputBufferBlockSize * storageComplexSize);
20848 uint64_t bufferId = 0;
20849 uint64_t offset = j;
20863#if(VKFFT_BACKEND==0)
20865 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20866 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20871 uint64_t bufferId = 0;
20872 uint64_t offset = j;
20875 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20877 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20886#if(VKFFT_BACKEND==0)
20891#if(VKFFT_BACKEND==0)
20892 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20893 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20901 uint64_t bufferId = 0;
20902 uint64_t offset = j;
20916#if(VKFFT_BACKEND==0)
20918 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20919 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20924 uint64_t bufferId = 0;
20925 uint64_t offset = j;
20928 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20930 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
20939#if(VKFFT_BACKEND==0)
20943#if(VKFFT_BACKEND==0)
20944 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20945 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20963 uint64_t bufferId = 0;
20964 uint64_t offset = j;
20978#if(VKFFT_BACKEND==0)
20980 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20981 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
20986 uint64_t bufferId = 0;
20987 uint64_t offset = j;
20990 if (offset >= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
20992 offset -= (uint64_t)ceil(app->
configuration.
bufferSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21001#if(VKFFT_BACKEND==0)
21006#if(VKFFT_BACKEND==0)
21007 descriptorBufferInfo.range = (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
21008 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.outputBufferBlockSize * storageComplexSize);
21014 uint64_t bufferId = 0;
21015 uint64_t offset = j;
21018 if (offset >= (uint64_t)ceil(app->
configuration.
kernelSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) {
21020 offset -= (uint64_t)ceil(app->
configuration.
kernelSize[l] / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21028#if(VKFFT_BACKEND==0)
21030 descriptorBufferInfo.range = (
axis->specializationConstants.kernelBlockSize * storageComplexSize);
21031 descriptorBufferInfo.offset = offset * (
axis->specializationConstants.kernelBlockSize * storageComplexSize);
21036#if(VKFFT_BACKEND==0)
21037 descriptorBufferInfo.buffer =
axis->bufferLUT;
21038 descriptorBufferInfo.offset = 0;
21039 descriptorBufferInfo.range =
axis->bufferLUTSize;
21042#if(VKFFT_BACKEND==0)
21043 VkWriteDescriptorSet
writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET };
21054 axis->specializationConstants.performBufferSetUpdate = 0;
21061#if(VKFFT_BACKEND==0)
21062 VkResult res = VK_SUCCESS;
21063#elif(VKFFT_BACKEND==1)
21064 cudaError_t res = cudaSuccess;
21065#elif(VKFFT_BACKEND==2)
21066 hipError_t res = hipSuccess;
21067#elif(VKFFT_BACKEND==3)
21068 cl_int res = CL_SUCCESS;
21075 uint64_t complexSize;
21077 complexSize = (2 *
sizeof(double));
21080 complexSize = (2 *
sizeof(float));
21082 complexSize = (2 *
sizeof(float));
21083 axis->specializationConstants.complexSize = complexSize;
21084 axis->specializationConstants.supportAxis = 0;
21089 axis->specializationConstants.dispatchZactualFFTSize = 1;
21092 double double_PI = 3.1415926535897932384626433832795;
21095 double* tempLUT = (
double*)malloc(
axis->bufferLUTSize);
21102 tempLUT[2 * i] = (double)
cos(
angle);
21103 tempLUT[2 * i + 1] = (double)
sin(
angle);
21105 axis->referenceLUT = 0;
21108#if(VKFFT_BACKEND==0)
21112 axis->referenceLUT = 1;
21115#if(VKFFT_BACKEND==0)
21116 resFFT =
allocateFFTBuffer(app, &
axis->bufferLUT, &
axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
axis->bufferLUTSize);
21130#elif(VKFFT_BACKEND==1)
// CUDA backend: allocate bufferLUTSize bytes on the device for the lookup table.
21131 res = cudaMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
21132 if (res != cudaSuccess) {
// Blocking host-to-device copy of the host table (tempLUT) into the new device buffer.
21138 res = cudaMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, cudaMemcpyHostToDevice);
21139 if (res != cudaSuccess) {
21145#elif(VKFFT_BACKEND==2)
21146 res = hipMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
21147 if (res != hipSuccess) {
21153 res = hipMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, hipMemcpyHostToDevice);
21154 if (res != hipSuccess) {
21160#elif(VKFFT_BACKEND==3)
21161 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
axis->bufferLUTSize, tempLUT, &res);
21162 if (res != CL_SUCCESS) {
21175 float* tempLUT = (
float*)malloc(
axis->bufferLUTSize);
21182 tempLUT[2 * i] = (float)
cos(
angle);
21183 tempLUT[2 * i + 1] = (float)
sin(
angle);
21185 axis->referenceLUT = 0;
21188#if(VKFFT_BACKEND==0)
21192 axis->referenceLUT = 1;
21195#if(VKFFT_BACKEND==0)
21196 resFFT =
allocateFFTBuffer(app, &
axis->bufferLUT, &
axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
axis->bufferLUTSize);
21210#elif(VKFFT_BACKEND==1)
21211 res = cudaMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
21212 if (res != cudaSuccess) {
21218 res = cudaMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, cudaMemcpyHostToDevice);
21219 if (res != cudaSuccess) {
21225#elif(VKFFT_BACKEND==2)
21226 res = hipMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
21227 if (res != hipSuccess) {
21233 res = hipMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, hipMemcpyHostToDevice);
21234 if (res != hipSuccess) {
21240#elif(VKFFT_BACKEND==3)
21241 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
axis->bufferLUTSize, tempLUT, &res);
21242 if (res != CL_SUCCESS) {
// axisStride is the destination (first the axis input strides); usedStride is the source.
21255 uint64_t* axisStride =
axis->specializationConstants.inputStride;
// NOTE(review): usedStride starts as a null pointer and is dereferenced below; the lines that
// select the actual source are not visible here - confirm it is always assigned first.
21256 uint64_t* usedStride = 0;
// Copy the five stride entries from the selected source.
21269 axisStride[0] = usedStride[0];
21270 axisStride[1] = usedStride[1];
21271 axisStride[2] = usedStride[2];
21272 axisStride[3] = usedStride[3];
21273 axisStride[4] = usedStride[4];
// Now fill the output strides, using the axis input strides as the source in this path.
21275 axisStride =
axis->specializationConstants.outputStride;
21276 usedStride =
axis->specializationConstants.inputStride;
21278 axisStride[0] = usedStride[0];
21279 axisStride[1] = usedStride[1];
21280 axisStride[2] = usedStride[2];
21281 axisStride[3] = usedStride[3];
21282 axisStride[4] = usedStride[4];
21286 uint64_t storageComplexSize;
21288 storageComplexSize = (2 *
sizeof(double));
21291 storageComplexSize = (2 * 2);
21293 storageComplexSize = (2 *
sizeof(float));
21295 uint64_t initPageSize = -1;
21296 uint64_t locBufferNum = 1;
21297 uint64_t locBufferSize = 0;
21311 uint64_t axis_id = 0;
21312 uint64_t axis_upload_id = 0;
21315 uint64_t totalSize = 0;
21316 uint64_t locPageSize = initPageSize;
21322 uint64_t totalSize = 0;
21323 uint64_t locPageSize = initPageSize;
21332 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21333 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21339 uint64_t totalSize = 0;
21340 uint64_t locPageSize = initPageSize;
21349 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21350 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21355 uint64_t totalSize = 0;
21356 uint64_t locPageSize = initPageSize;
21366 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21367 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21387 uint64_t totalSize = 0;
21388 uint64_t locPageSize = initPageSize;
// Block size in complex elements: the whole buffer when there is a single buffer,
// otherwise the page size converted from bytes to complex elements (rounded up).
21397 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
// Number of such blocks needed to cover totalSize bytes. This must be stored in
// inputBufferBlockNum; assigning it to inputBufferBlockSize (as before) overwrote the size
// computed above and left the block count unset.
21398 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21403 uint64_t totalSize = 0;
21404 uint64_t locPageSize = initPageSize;
// Block size in complex elements: the whole buffer when there is a single buffer,
// otherwise the page size converted from bytes to complex elements (rounded up).
21414 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
// Number of such blocks needed to cover totalSize bytes. This must be stored in
// inputBufferBlockNum; assigning it to inputBufferBlockSize (as before) overwrote the size
// computed above and left the block count unset.
21415 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
21423 locBufferSize = -1;
21427 uint64_t totalSize = 0;
21428 uint64_t locPageSize = initPageSize;
21437 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21438 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21443 uint64_t totalSize = 0;
21444 uint64_t locPageSize = initPageSize;
21454 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21455 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21474 uint64_t totalSize = 0;
21475 uint64_t locPageSize = initPageSize;
21484 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21485 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
21490 uint64_t totalSize = 0;
21491 uint64_t locPageSize = initPageSize;
21501 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21502 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
// Guarantee at least one block on both the input and the output side, so that a block count
// of zero is never used for the binding counts.
21509 if (
axis->specializationConstants.inputBufferBlockNum == 0)
axis->specializationConstants.inputBufferBlockNum = 1;
21510 if (
axis->specializationConstants.outputBufferBlockNum == 0)
axis->specializationConstants.outputBufferBlockNum = 1;
21513 uint64_t totalSize = 0;
21514 uint64_t locPageSize = initPageSize;
21521 axis->specializationConstants.kernelBlockSize = (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
21522 axis->specializationConstants.kernelBlockNum = (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.kernelBlockSize * storageComplexSize));
21524 if (
axis->specializationConstants.kernelBlockNum == 0)
axis->specializationConstants.kernelBlockNum = 1;
21527 axis->specializationConstants.kernelBlockSize = 0;
21528 axis->specializationConstants.kernelBlockNum = 0;
21530 axis->numBindings = 2;
21531 axis->specializationConstants.numBuffersBound[0] =
axis->specializationConstants.inputBufferBlockNum;
21532 axis->specializationConstants.numBuffersBound[1] =
axis->specializationConstants.outputBufferBlockNum;
21533 axis->specializationConstants.numBuffersBound[2] = 0;
21534 axis->specializationConstants.numBuffersBound[3] = 0;
21536#if(VKFFT_BACKEND==0)
21538 descriptorPoolSize.descriptorCount = (uint32_t)(
axis->specializationConstants.numBuffersBound[0] +
axis->specializationConstants.numBuffersBound[1]);
21541 axis->specializationConstants.numBuffersBound[
axis->numBindings] =
axis->specializationConstants.kernelBlockNum;
21542#if(VKFFT_BACKEND==0)
21545 axis->numBindings++;
21549 axis->specializationConstants.numBuffersBound[
axis->numBindings] = 1;
21550#if(VKFFT_BACKEND==0)
21553 axis->numBindings++;
21555#if(VKFFT_BACKEND==0)
21561 if (res != VK_SUCCESS) {
21565 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
21566 VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
21567 descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(
axis->numBindings *
sizeof(VkDescriptorSetLayoutBinding));
21568 if (!descriptorSetLayoutBindings) {
21572 for (uint64_t i = 0; i <
axis->numBindings; ++i) {
21573 descriptorSetLayoutBindings[i].binding = (uint32_t)i;
21574 descriptorSetLayoutBindings[i].descriptorType = descriptorType;
21575 descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)
axis->specializationConstants.numBuffersBound[i];
21576 descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
21584 if (res != VK_SUCCESS) {
21588 free(descriptorSetLayoutBindings);
21589 descriptorSetLayoutBindings = 0;
21595 if (res != VK_SUCCESS) {
21611#if(VKFFT_BACKEND==0)
21624 if (res != VK_SUCCESS) {
21629 axis->axisBlock[0] = 128;
21631 axis->axisBlock[1] = 1;
21632 axis->axisBlock[2] = 1;
21638 else axis->specializationConstants.performWorkGroupShift[0] = 0;
21640 else axis->specializationConstants.performWorkGroupShift[1] = 0;
21642 else axis->specializationConstants.performWorkGroupShift[2] = 0;
21644 axis->specializationConstants.localSize[0] =
axis->axisBlock[0];
21645 axis->specializationConstants.localSize[1] =
axis->axisBlock[1];
21646 axis->specializationConstants.localSize[2] =
axis->axisBlock[2];
21663 axis->specializationConstants.axis_id = 0;
21664 axis->specializationConstants.axis_upload_id = 0;
21666 for (uint64_t i = 0; i < 3; i++) {
21679 axis->specializationConstants.zeropad[0] = 0;
21686 axis->specializationConstants.zeropad[1] = 0;
21695 axis->specializationConstants.zeropad[0] = 0;
21702 axis->specializationConstants.zeropad[1] = 0;
21705 axis->specializationConstants.convolutionStep = 1;
21708 axis->specializationConstants.convolutionStep = 0;
21709 char floatTypeInputMemory[10];
21710 char floatTypeOutputMemory[10];
21711 char floatTypeKernelMemory[10];
21712 char floatType[10];
21713 axis->specializationConstants.unroll = 1;
21716 sprintf(floatType,
"double");
21717 sprintf(floatTypeInputMemory,
"double");
21718 sprintf(floatTypeOutputMemory,
"double");
21719 sprintf(floatTypeKernelMemory,
"double");
21725 sprintf(floatType,
"float");
21728 sprintf(floatTypeInputMemory,
"float");
21729 sprintf(floatTypeOutputMemory,
"float");
21730 sprintf(floatTypeKernelMemory,
"float");
21733 sprintf(floatTypeInputMemory,
"half");
21734 sprintf(floatTypeOutputMemory,
"half");
21735 sprintf(floatTypeKernelMemory,
"half");
21741 sprintf(floatType,
"double");
21742 sprintf(floatTypeInputMemory,
"float");
21743 sprintf(floatTypeOutputMemory,
"float");
21744 sprintf(floatTypeKernelMemory,
"float");
21747 sprintf(floatType,
"float");
21748 sprintf(floatTypeInputMemory,
"float");
21749 sprintf(floatTypeOutputMemory,
"float");
21750 sprintf(floatTypeKernelMemory,
"float");
21754 char uintType[20] =
"";
21756#if(VKFFT_BACKEND==0)
21757 sprintf(uintType,
"uint");
21758#elif(VKFFT_BACKEND==1)
21759 sprintf(uintType,
"unsigned int");
21760#elif(VKFFT_BACKEND==2)
21761 sprintf(uintType,
"unsigned int");
21762#elif(VKFFT_BACKEND==3)
21763 sprintf(uintType,
"unsigned int");
21767#if(VKFFT_BACKEND==0)
21768 sprintf(uintType,
"uint64_t");
21769#elif(VKFFT_BACKEND==1)
21770 sprintf(uintType,
"unsigned long long");
21771#elif(VKFFT_BACKEND==2)
21772 sprintf(uintType,
"unsigned long long");
21773#elif(VKFFT_BACKEND==3)
21774 sprintf(uintType,
"unsigned long");
21783 char* code0 =
axis->specializationConstants.code0;
21788 resFFT =
shaderGenVkFFT_R2C_decomposition(code0, &
axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
21794#if(VKFFT_BACKEND==0)
21795 const glslang_resource_t default_resource = {
21901 glslang_target_client_version_t client_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
21902 glslang_target_language_version_t target_language_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
21903 const glslang_input_t input =
21905 GLSLANG_SOURCE_GLSL,
21906 GLSLANG_STAGE_COMPUTE,
21907 GLSLANG_CLIENT_VULKAN,
21909 GLSLANG_TARGET_SPV,
21910 target_language_version,
21913 GLSLANG_NO_PROFILE,
21916 GLSLANG_MSG_DEFAULT_BIT,
21920 glslang_shader_t* shader = glslang_shader_create(&input);
21922 if (!glslang_shader_preprocess(shader, &input))
21924 err = glslang_shader_get_info_log(shader);
21925 printf(
"%s\n", code0);
21926 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
21927 glslang_shader_delete(shader);
21935 if (!glslang_shader_parse(shader, &input))
21937 err = glslang_shader_get_info_log(shader);
21938 printf(
"%s\n", code0);
21939 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
21940 glslang_shader_delete(shader);
21947 glslang_program_t* program = glslang_program_create();
21948 glslang_program_add_shader(program, shader);
21949 if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
21951 err = glslang_program_get_info_log(program);
21952 printf(
"%s\n", code0);
21953 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
21954 glslang_shader_delete(shader);
21955 glslang_program_delete(program);
21978 glslang_shader_delete(shader);
21979 VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
21981 pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
21982 VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
21986 res = vkCreateShaderModule(app->
configuration.
device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
21987 if (res != VK_SUCCESS) {
21988 glslang_program_delete(program);
21994 pipelineShaderStageCreateInfo.pName =
"main";
21995 pipelineShaderStageCreateInfo.pSpecializationInfo = 0;
21999 if (res != VK_SUCCESS) {
22003 vkDestroyShaderModule(app->
configuration.
device[0], pipelineShaderStageCreateInfo.module, 0);
22004 glslang_program_delete(program);
22005#elif(VKFFT_BACKEND==1)
22007 nvrtcResult result = nvrtcCreateProgram(&prog,
22015 if (result != NVRTC_SUCCESS) {
22016 printf(
"nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
22025 result = nvrtcCompileProgram(prog,
22028 if (result != NVRTC_SUCCESS) {
22029 printf(
"nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
22030 char*
log = (
char*)malloc(
sizeof(
char) * 1000000);
22038 nvrtcGetProgramLog(prog,
log);
22039 printf(
"%s\n",
log);
22042 printf(
"%s\n", code0);
22050 result = nvrtcGetPTXSize(prog, &ptxSize);
22051 if (result != NVRTC_SUCCESS) {
22052 printf(
"nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
22058 char* ptx = (
char*)malloc(ptxSize);
22065 result = nvrtcGetPTX(prog, ptx);
22066 if (result != NVRTC_SUCCESS) {
22067 printf(
"nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
22075 result = nvrtcDestroyProgram(&prog);
22076 if (result != NVRTC_SUCCESS) {
22077 printf(
"nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
22086 CUresult result2 = cuModuleLoadDataEx(&
axis->VkFFTModule, ptx, 0, 0, 0);
22088 if (result2 != CUDA_SUCCESS) {
22089 printf(
"cuModuleLoadDataEx error: %d\n", result2);
22097 result2 = cuModuleGetFunction(&
axis->VkFFTKernel,
axis->VkFFTModule,
"VkFFT_main_R2C");
22098 if (result2 != CUDA_SUCCESS) {
22099 printf(
"cuModuleGetFunction error: %d\n", result2);
22108 result2 = cuFuncSetAttribute(
axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (
int)
axis->specializationConstants.usedSharedMemory);
22109 if (result2 != CUDA_SUCCESS) {
22110 printf(
"cuFuncSetAttribute error: %d\n", result2);
22120 result2 = cuModuleGetGlobal(&
axis->consts_addr, &size,
axis->VkFFTModule,
"consts");
22121 if (result2 != CUDA_SUCCESS) {
22122 printf(
"cuModuleGetGlobal error: %d\n", result2);
22132#elif(VKFFT_BACKEND==2)
22133 hiprtcProgram prog;
22138 enum hiprtcResult result = hiprtcCreateProgram(&prog,
22144 if (result != HIPRTC_SUCCESS) {
22145 printf(
"hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
22152 result = hiprtcAddNameExpression(prog,
"&consts");
22153 if (result != HIPRTC_SUCCESS) {
22154 printf(
"hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
22161 result = hiprtcCompileProgram(prog,
22164 if (result != HIPRTC_SUCCESS) {
22165 printf(
"hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
22166 char*
log = (
char*)malloc(
sizeof(
char) * 100000);
22174 hiprtcGetProgramLog(prog,
log);
22175 printf(
"%s\n",
log);
22178 printf(
"%s\n", code0);
22186 result = hiprtcGetCodeSize(prog, &codeSize);
22187 if (result != HIPRTC_SUCCESS) {
22188 printf(
"hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
22194 char* code = (
char*)malloc(codeSize);
22201 result = hiprtcGetCode(prog, code);
22202 if (result != HIPRTC_SUCCESS) {
22203 printf(
"hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
22213 result = hiprtcDestroyProgram(&prog);
22214 if (result != HIPRTC_SUCCESS) {
22215 printf(
"hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
22223 hipError_t result2 = hipModuleLoadDataEx(&
axis->VkFFTModule, code, 0, 0, 0);
22225 if (result2 != hipSuccess) {
22226 printf(
"hipModuleLoadDataEx error: %d\n", result2);
22234 result2 = hipModuleGetFunction(&
axis->VkFFTKernel,
axis->VkFFTModule,
"VkFFT_main_R2C");
22235 if (result2 != hipSuccess) {
22236 printf(
"hipModuleGetFunction error: %d\n", result2);
22245 result2 = hipFuncSetAttribute(
axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (
int)
axis->specializationConstants.usedSharedMemory);
22247 if (result2 != hipSuccess) {
22248 printf(
"hipFuncSetAttribute error: %d\n", result2);
22258 result2 = hipModuleGetGlobal(&
axis->consts_addr, &size,
axis->VkFFTModule,
"consts");
22259 if (result2 != hipSuccess) {
22260 printf(
"hipModuleGetGlobal error: %d\n", result2);
22271#elif(VKFFT_BACKEND==3)
22272 size_t codelen = strlen(code0);
22273 axis->program = clCreateProgramWithSource(app->
configuration.context[0], 1, (
const char**)&code0, &codelen, &res);
22274 if (res != CL_SUCCESS) {
22281 if (res != CL_SUCCESS) {
22284 char*
log = (
char*)malloc(log_size);
22293 printf(
"%s\n",
log);
22296 printf(
"%s\n", code0);
22303 axis->kernel = clCreateKernel(
axis->program,
"VkFFT_main_R2C", &res);
22304 if (res != CL_SUCCESS) {
22314 axis->specializationConstants.code0 = 0;
22322#if(VKFFT_BACKEND==0)
22323 VkResult res = VK_SUCCESS;
22324#elif(VKFFT_BACKEND==1)
22325 cudaError_t res = cudaSuccess;
22326#elif(VKFFT_BACKEND==2)
22327 hipError_t res = hipSuccess;
22328#elif(VKFFT_BACKEND==3)
22329 cl_int res = CL_SUCCESS;
22347 uint64_t complexSize;
22349 complexSize = (2 *
sizeof(double));
22352 complexSize = (2 *
sizeof(float));
22354 complexSize = (2 *
sizeof(float));
22355 axis->specializationConstants.complexSize = complexSize;
22356 axis->specializationConstants.supportAxis = 0;
22366 axis->specializationConstants.stageStartSize = 1;
22367 for (uint64_t i = 0; i < axis_upload_id; i++)
22368 axis->specializationConstants.stageStartSize *= FFTPlan->
axisSplit[axis_id][i];
22373 if (axis_id == 0) {
22375 axis->specializationConstants.fft_dim_x =
axis->specializationConstants.stageStartSize;
22381 axis->specializationConstants.useBluesteinFFT = 1;
22385 axis->specializationConstants.actualInverse =
inverse;
22386 axis->specializationConstants.inverse = !
inverse;
22390 axis->specializationConstants.actualInverse =
inverse;
22391 axis->specializationConstants.inverse = 1;
22394 axis->specializationConstants.actualInverse =
inverse;
22399 axis->specializationConstants.actualInverse =
inverse;
22400 axis->specializationConstants.inverse = reverseBluesteinMultiUpload;
22402 axis->specializationConstants.inverseBluestein = !
inverse;
22406 axis->specializationConstants.inverseBluestein = 1;
22409 axis->specializationConstants.inverseBluestein =
inverse;
22413 axis->specializationConstants.reverseBluesteinMultiUpload = reverseBluesteinMultiUpload;
22417 if ((axis_id == 0) && ((FFTPlan->
numAxisUploads[axis_id] == 1) || ((axis_upload_id == 0) && (!
axis->specializationConstants.reorderFourStep)))) {
22418 maxSequenceLengthSharedMemory *=
axis->specializationConstants.registerBoost;
22419 maxSequenceLengthSharedMemoryPow2 = (uint64_t)
pow(2, (uint64_t)log2(maxSequenceLengthSharedMemory));
22422 maxSingleSizeStrided *=
axis->specializationConstants.registerBoost;
22423 maxSingleSizeStridedPow2 = (uint64_t)
pow(2, (uint64_t)log2(maxSingleSizeStrided));
22429 axis->specializationConstants.performDCT = 2;
22441#if(VKFFT_BACKEND==0)
22452#elif(VKFFT_BACKEND==1)
22459 if (res != cudaSuccess) {
22463#elif(VKFFT_BACKEND==2)
22470 if (res != hipSuccess) {
22474#elif(VKFFT_BACKEND==3)
22481 if (res != CL_SUCCESS) {
22489 double double_PI = 3.1415926535897932384626433832795;
22490 uint64_t dimMult = 1;
22491 uint64_t maxStageSum = 0;
22492 for (uint64_t i = 0; i <
axis->specializationConstants.numStages; i++) {
22493 switch (
axis->specializationConstants.stageRadix[i]) {
22495 maxStageSum += dimMult;
22498 maxStageSum += dimMult * 2;
22501 maxStageSum += dimMult * 2;
22504 maxStageSum += dimMult * 4;
22507 maxStageSum += dimMult * 6;
22510 maxStageSum += dimMult * 3;
22513 maxStageSum += dimMult * 10;
22516 maxStageSum += dimMult * 12;
22519 dimMult *=
axis->specializationConstants.stageRadix[i];
22521 axis->specializationConstants.maxStageSumLUT = maxStageSum;
22524 if (axis_upload_id > 0) {
22526 axis->specializationConstants.startDCT3LUT = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim);
22527 axis->bufferLUTSize = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(double);
22531 axis->specializationConstants.startDCT3LUT = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim);
22532 axis->specializationConstants.startDCT4LUT = (
axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
22533 axis->bufferLUTSize = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 4 + 2) + app->
configuration.
size[axis_id] / 2) * 2 *
sizeof(double);
22536 axis->bufferLUTSize = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim) * 2 *
sizeof(
double);
22541 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22542 axis->bufferLUTSize = (maxStageSum + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(double);
22546 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22547 axis->specializationConstants.startDCT4LUT = (
axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
22552 axis->bufferLUTSize = (maxStageSum) * 2 *
sizeof(
double);
22555 double* tempLUT = (
double*)malloc(
axis->bufferLUTSize);
22560 uint64_t localStageSize = 1;
22561 uint64_t localStageSum = 0;
22562 for (uint64_t i = 0; i <
axis->specializationConstants.numStages; i++) {
22563 if ((
axis->specializationConstants.stageRadix[i] & (
axis->specializationConstants.stageRadix[i] - 1)) == 0) {
22564 for (uint64_t k = 0; k < log2(
axis->specializationConstants.stageRadix[i]); k++) {
22565 for (uint64_t j = 0; j < localStageSize; j++) {
22566 tempLUT[2 * (j + localStageSum)] =
cos(j * double_PI / localStageSize /
pow(2, k));
22567 tempLUT[2 * (j + localStageSum) + 1] =
sin(j * double_PI / localStageSize /
pow(2, k));
22569 localStageSum += localStageSize;
22571 localStageSize *=
axis->specializationConstants.stageRadix[i];
22574 for (uint64_t k = (
axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
22575 for (uint64_t j = 0; j < localStageSize; j++) {
22576 tempLUT[2 * (j + localStageSum)] =
cos(j * 2.0 * k /
axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22577 tempLUT[2 * (j + localStageSum) + 1] =
sin(j * 2.0 * k /
axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22579 localStageSum += localStageSize;
22581 localStageSize *=
axis->specializationConstants.stageRadix[i];
22585 if (axis_upload_id > 0) {
22586 for (uint64_t i = 0; i <
axis->specializationConstants.stageStartSize; i++) {
22587 for (uint64_t j = 0; j <
axis->specializationConstants.fftDim; j++) {
22588 double angle = 2 * double_PI * ((i * j) / (
double)(
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim));
22589 tempLUT[maxStageSum * 2 + 2 * (i + j *
axis->specializationConstants.stageStartSize)] =
cos(
angle);
22590 tempLUT[maxStageSum * 2 + 2 * (i + j *
axis->specializationConstants.stageStartSize) + 1] =
sin(
angle);
22597 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j] =
cos(
angle);
22598 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j + 1] =
sin(
angle);
22604 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j] =
cos(
angle);
22605 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j + 1] =
sin(
angle);
22609 tempLUT[2 *
axis->specializationConstants.startDCT4LUT + 2 * j] =
cos(
angle);
22610 tempLUT[2 *
axis->specializationConstants.startDCT4LUT + 2 * j + 1] =
sin(
angle);
22613 axis->referenceLUT = 0;
22614 if (reverseBluesteinMultiUpload == 1) {
22616#if(VKFFT_BACKEND==0)
22620 axis->referenceLUT = 1;
22625#if(VKFFT_BACKEND==0)
22629 axis->referenceLUT = 1;
22634#if(VKFFT_BACKEND==0)
22638 axis->referenceLUT = 1;
22643#if(VKFFT_BACKEND==0)
22647 axis->referenceLUT = 1;
22650#if(VKFFT_BACKEND==0)
22651 resFFT =
allocateFFTBuffer(app, &
axis->bufferLUT, &
axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
axis->bufferLUTSize);
22665#elif(VKFFT_BACKEND==1)
22666 res = cudaMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
22667 if (res != cudaSuccess) {
22673 res = cudaMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, cudaMemcpyHostToDevice);
22674 if (res != cudaSuccess) {
22680#elif(VKFFT_BACKEND==2)
22681 res = hipMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
22682 if (res != hipSuccess) {
22688 res = hipMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, hipMemcpyHostToDevice);
22689 if (res != hipSuccess) {
22695#elif(VKFFT_BACKEND==3)
22696 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
axis->bufferLUTSize, tempLUT, &res);
22697 if (res != CL_SUCCESS) {
22712 if (axis_upload_id > 0) {
22714 axis->specializationConstants.startDCT3LUT = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim);
22715 axis->bufferLUTSize = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(float);
22719 axis->specializationConstants.startDCT3LUT = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim);
22720 axis->specializationConstants.startDCT4LUT = (
axis->specializationConstants.startDCT3LUT + (
axis->specializationConstants.fftDim / 4 + 2));
22721 axis->bufferLUTSize = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim + (app->
configuration.
size[axis_id] / 4 + 2) + app->
configuration.
size[axis_id] / 2) * 2 *
sizeof(float);
22724 axis->bufferLUTSize = (maxStageSum +
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim) * 2 *
sizeof(
float);
22729 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22730 axis->bufferLUTSize = (maxStageSum + (app->
configuration.
size[axis_id] / 2 + 2)) * 2 *
sizeof(float);
22734 axis->specializationConstants.startDCT3LUT = (maxStageSum);
22735 axis->specializationConstants.startDCT4LUT = (
axis->specializationConstants.startDCT3LUT + (app->
configuration.
size[axis_id] / 4 + 2));
22739 axis->bufferLUTSize = (maxStageSum) * 2 *
sizeof(
float);
22742 float* tempLUT = (
float*)malloc(
axis->bufferLUTSize);
22747 uint64_t localStageSize = 1;
22748 uint64_t localStageSum = 0;
22749 for (uint64_t i = 0; i <
axis->specializationConstants.numStages; i++) {
22750 if ((
axis->specializationConstants.stageRadix[i] & (
axis->specializationConstants.stageRadix[i] - 1)) == 0) {
22751 for (uint64_t k = 0; k < log2(
axis->specializationConstants.stageRadix[i]); k++) {
22752 for (uint64_t j = 0; j < localStageSize; j++) {
22753 tempLUT[2 * (j + localStageSum)] = (
float)
cos(j * double_PI / localStageSize /
pow(2, k));
22754 tempLUT[2 * (j + localStageSum) + 1] = (
float)
sin(j * double_PI / localStageSize /
pow(2, k));
22756 localStageSum += localStageSize;
22758 localStageSize *=
axis->specializationConstants.stageRadix[i];
22761 for (uint64_t k = (
axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) {
22762 for (uint64_t j = 0; j < localStageSize; j++) {
22763 tempLUT[2 * (j + localStageSum)] = (
float)
cos(j * 2.0 * k /
axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22764 tempLUT[2 * (j + localStageSum) + 1] = (
float)
sin(j * 2.0 * k /
axis->specializationConstants.stageRadix[i] * double_PI / localStageSize);
22766 localStageSum += localStageSize;
22768 localStageSize *=
axis->specializationConstants.stageRadix[i];
22772 if (axis_upload_id > 0) {
22773 for (uint64_t i = 0; i <
axis->specializationConstants.stageStartSize; i++) {
22774 for (uint64_t j = 0; j <
axis->specializationConstants.fftDim; j++) {
22775 double angle = 2 * double_PI * ((i * j) / (
double)(
axis->specializationConstants.stageStartSize *
axis->specializationConstants.fftDim));
22776 tempLUT[maxStageSum * 2 + 2 * (i + j *
axis->specializationConstants.stageStartSize)] = (
float)
cos(
angle);
22777 tempLUT[maxStageSum * 2 + 2 * (i + j *
axis->specializationConstants.stageStartSize) + 1] = (
float)
sin(
angle);
22784 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j] = (float)
cos(
angle);
22785 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)
sin(
angle);
22791 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j] = (float)
cos(
angle);
22792 tempLUT[2 *
axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)
sin(
angle);
22796 tempLUT[2 *
axis->specializationConstants.startDCT4LUT + 2 * j] = (float)
cos(
angle);
22797 tempLUT[2 *
axis->specializationConstants.startDCT4LUT + 2 * j + 1] = (float)
sin(
angle);
22800 axis->referenceLUT = 0;
22801 if (reverseBluesteinMultiUpload == 1) {
22803#if(VKFFT_BACKEND==0)
22807 axis->referenceLUT = 1;
22812#if(VKFFT_BACKEND==0)
22816 axis->referenceLUT = 1;
22821#if(VKFFT_BACKEND==0)
22825 axis->referenceLUT = 1;
22830#if(VKFFT_BACKEND==0)
22834 axis->referenceLUT = 1;
22837#if(VKFFT_BACKEND==0)
22838 resFFT =
allocateFFTBuffer(app, &
axis->bufferLUT, &
axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
axis->bufferLUTSize);
22852#elif(VKFFT_BACKEND==1)
22853 res = cudaMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
22854 if (res != cudaSuccess) {
22860 res = cudaMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, cudaMemcpyHostToDevice);
22861 if (res != cudaSuccess) {
22867#elif(VKFFT_BACKEND==2)
22868 res = hipMalloc((
void**)&
axis->bufferLUT,
axis->bufferLUTSize);
22869 if (res != hipSuccess) {
22875 res = hipMemcpy(
axis->bufferLUT, tempLUT,
axis->bufferLUTSize, hipMemcpyHostToDevice);
22876 if (res != hipSuccess) {
22882#elif(VKFFT_BACKEND==3)
22883 axis->bufferLUT = clCreateBuffer(app->
configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
axis->bufferLUTSize, tempLUT, &res);
22884 if (res != CL_SUCCESS) {
22902 uint64_t* axisStride =
axis->specializationConstants.inputStride;
22909 if (axis_id == 0) {
22910 axisStride[1] = usedStride[0];
22911 axisStride[2] = usedStride[1];
22915 axisStride[1] = usedStride[0];
22916 axisStride[2] = usedStride[1];
22920 axisStride[1] = usedStride[1];
22921 axisStride[2] = usedStride[0];
22924 axisStride[3] = usedStride[2];
22930 if (axis_id == 0) {
22950 axisStride[1] *= 2;
22951 axisStride[2] *= 2;
22952 axisStride[3] *= 2;
22953 axisStride[4] *= 2;
22956 for (uint64_t i = 1; i < 5; i++) {
22957 axisStride[i] /= 2;
22960 axisStride =
axis->specializationConstants.outputStride;
22968 if (axis_id == 0) {
22969 axisStride[1] = usedStride[0];
22970 axisStride[2] = usedStride[1];
22974 axisStride[1] = usedStride[0];
22975 axisStride[2] = usedStride[1];
22979 axisStride[1] = usedStride[1];
22980 axisStride[2] = usedStride[0];
22983 axisStride[3] = usedStride[2];
22989 if (axis_id == 0) {
23009 axisStride[1] *= 2;
23010 axisStride[2] *= 2;
23011 axisStride[3] *= 2;
23012 axisStride[4] *= 2;
23015 for (uint64_t i = 1; i < 5; i++) {
23016 axisStride[i] /= 2;
23028 uint64_t storageComplexSize;
23030 storageComplexSize = (2 *
sizeof(double));
23033 storageComplexSize = (2 * 2);
23035 storageComplexSize = (2 *
sizeof(float));
23037 uint64_t initPageSize = -1;
23038 uint64_t locBufferNum = 1;
23039 uint64_t locBufferSize = -1;
23070 uint64_t totalSize = 0;
23071 uint64_t locPageSize = initPageSize;
23080 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23081 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23087 uint64_t totalSize = 0;
23088 uint64_t locPageSize = initPageSize;
23097 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23098 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23103 uint64_t totalSize = 0;
23104 uint64_t locPageSize = initPageSize;
23106 if (((
axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->
useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1))) {
23141 axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23142 axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.inputBufferBlockSize * storageComplexSize));
23149 locBufferSize = -1;
23163 uint64_t totalSize = 0;
23164 uint64_t locPageSize = initPageSize;
23173 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23174 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
23179 uint64_t totalSize = 0;
23180 uint64_t locPageSize = initPageSize;
23182 if (((
axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->
useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (
axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) {
23213 axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23214 axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.outputBufferBlockSize * storageComplexSize));
23218 if (
axis->specializationConstants.inputBufferBlockNum == 0)
axis->specializationConstants.inputBufferBlockNum = 1;
23219 if (
axis->specializationConstants.outputBufferBlockNum == 0)
axis->specializationConstants.outputBufferBlockNum = 1;
23221 uint64_t totalSize = 0;
23222 uint64_t locPageSize = initPageSize;
23231 axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (
double)storageComplexSize);
23232 axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (
double)(
axis->specializationConstants.kernelBlockSize * storageComplexSize));
23234 if (
axis->specializationConstants.kernelBlockNum == 0)
axis->specializationConstants.kernelBlockNum = 1;
23237 axis->specializationConstants.kernelBlockSize = 0;
23238 axis->specializationConstants.kernelBlockNum = 0;
23240 axis->numBindings = 2;
23241 axis->specializationConstants.numBuffersBound[0] =
axis->specializationConstants.inputBufferBlockNum;
23242 axis->specializationConstants.numBuffersBound[1] =
axis->specializationConstants.outputBufferBlockNum;
23243 axis->specializationConstants.numBuffersBound[2] = 0;
23244 axis->specializationConstants.numBuffersBound[3] = 0;
23245#if(VKFFT_BACKEND==0)
23247 descriptorPoolSize.descriptorCount = (uint32_t)(
axis->specializationConstants.inputBufferBlockNum +
axis->specializationConstants.outputBufferBlockNum);
23249 axis->specializationConstants.convolutionBindingID = -1;
23251 axis->specializationConstants.convolutionBindingID =
axis->numBindings;
23252 axis->specializationConstants.numBuffersBound[
axis->numBindings] =
axis->specializationConstants.kernelBlockNum;
23253#if(VKFFT_BACKEND==0)
23256 axis->numBindings++;
23259 axis->specializationConstants.convolutionBindingID =
axis->numBindings;
23260 axis->specializationConstants.numBuffersBound[
axis->numBindings] =
axis->specializationConstants.kernelBlockNum;
23261#if(VKFFT_BACKEND==0)
23264 axis->numBindings++;
23267 axis->specializationConstants.convolutionBindingID =
axis->numBindings;
23268 axis->specializationConstants.numBuffersBound[
axis->numBindings] =
axis->specializationConstants.kernelBlockNum;
23269#if(VKFFT_BACKEND==0)
23272 axis->numBindings++;
23275 axis->specializationConstants.LUTBindingID =
axis->numBindings;
23276 axis->specializationConstants.numBuffersBound[
axis->numBindings] = 1;
23277#if(VKFFT_BACKEND==0)
23280 axis->numBindings++;
23283 if (
axis->specializationConstants.inverseBluestein)
23287 axis->specializationConstants.BluesteinConvolutionBindingID =
axis->numBindings;
23288 axis->specializationConstants.numBuffersBound[
axis->numBindings] = 1;
23289#if(VKFFT_BACKEND==0)
23292 axis->numBindings++;
23296 axis->specializationConstants.BluesteinMultiplicationBindingID =
axis->numBindings;
23297 axis->specializationConstants.numBuffersBound[
axis->numBindings] = 1;
23298#if(VKFFT_BACKEND==0)
23301 axis->numBindings++;
23303#if(VKFFT_BACKEND==0)
23309 if (res != VK_SUCCESS) {
23313 const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
23314 VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings;
23315 descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(
axis->numBindings *
sizeof(VkDescriptorSetLayoutBinding));
23316 if (!descriptorSetLayoutBindings) {
23320 for (uint64_t i = 0; i <
axis->numBindings; ++i) {
23321 descriptorSetLayoutBindings[i].binding = (uint32_t)i;
23322 descriptorSetLayoutBindings[i].descriptorType = descriptorType;
23323 descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)
axis->specializationConstants.numBuffersBound[i];
23324 descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
23332 if (res != VK_SUCCESS) {
23336 free(descriptorSetLayoutBindings);
23337 descriptorSetLayoutBindings = 0;
23343 if (res != VK_SUCCESS) {
23359#if(VKFFT_BACKEND==0)
23372 if (res != VK_SUCCESS) {
23378 axis->groupedBatch = maxBatchCoalesced;
23394 if (((FFTPlan->
numAxisUploads[axis_id] == 1) && (axis_id == 0)) || ((axis_id == 0) && (!
axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) {
23395 axis->groupedBatch = (maxSequenceLengthSharedMemoryPow2 /
axis->specializationConstants.fftDim >
axis->groupedBatch) ? maxSequenceLengthSharedMemoryPow2 /
axis->specializationConstants.fftDim :
axis->groupedBatch;
23398 axis->groupedBatch = (maxSingleSizeStridedPow2 /
axis->specializationConstants.fftDim > 1) ? maxSingleSizeStridedPow2 /
axis->specializationConstants.fftDim *
axis->groupedBatch :
axis->groupedBatch;
23403 if ((FFTPlan->
numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (
axis->specializationConstants.fftDim * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) {
23404 axis->groupedBatch = (uint64_t)ceil(
axis->groupedBatch / 2.0);
23407 if ((FFTPlan->
numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && (
axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / (2 * complexSize))) {
23408 axis->groupedBatch = (uint64_t)ceil(
axis->groupedBatch / 2.0);
// Clamp groupedBatch to at least maxBatchCoalesced, then round it down to a
// whole multiple of maxBatchCoalesced (integer divide, then multiply back).
23410 if (
axis->groupedBatch < maxBatchCoalesced)
axis->groupedBatch = maxBatchCoalesced;
23411 axis->groupedBatch = (
axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced;
// Halve groupedBatch (rounding up) when fftDim exceeds maxSingleSizeStrided and
// neither axis_id == 0 exclusion applies: the single-upload case, or the first
// upload without reorderFourStep.
23413 if (!((axis_id == 0) && (FFTPlan->
numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!
axis->specializationConstants.reorderFourStep)) && (
axis->specializationConstants.fftDim > maxSingleSizeStrided)) {
23414 axis->groupedBatch = (uint64_t)ceil(
axis->groupedBatch / 2.0);
23418 axis->groupedBatch = (uint64_t)ceil(
axis->groupedBatch / 2.0);
// Above 2 * maxBatchCoalesced, round down to a multiple of 2 * maxBatchCoalesced.
23420 if (
axis->groupedBatch > 2 * maxBatchCoalesced)
axis->groupedBatch = (
axis->groupedBatch / (2 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
// NOTE(review): this branch divides by 4 * maxBatchCoalesced but multiplies by
// 2 * maxBatchCoalesced, so it halves the value instead of rounding it to a
// multiple of 4 * maxBatchCoalesced. Confirm whether this is intentional.
23421 if (
axis->groupedBatch > 4 * maxBatchCoalesced)
axis->groupedBatch = (
axis->groupedBatch / (4 * maxBatchCoalesced)) * (2 * maxBatchCoalesced);
23422 uint64_t maxThreadNum = maxSequenceLengthSharedMemory / (
axis->specializationConstants.min_registers_per_thread *
axis->specializationConstants.registerBoost);
23424 axis->specializationConstants.axisSwapped = 0;
23425 uint64_t r2cmult = (
axis->specializationConstants.mergeSequencesR2C) ? 2 : 1;
23426 if (axis_id == 0) {
23428 if (axis_upload_id == 0) {
23429 axis->axisBlock[0] = (
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost > 1) ?
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost : 1;
23430 if (
axis->axisBlock[0] > maxThreadNum)
axis->axisBlock[0] = maxThreadNum;
23432 if (
axis->specializationConstants.reorderFourStep && (FFTPlan->
numAxisUploads[axis_id] > 1))
23433 axis->axisBlock[1] =
axis->groupedBatch;
23438 uint64_t currentAxisBlock1 =
axis->axisBlock[1];
23439 for (uint64_t i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) {
23441 if (i *
axis->specializationConstants.fftDim * complexSize <= app->configuration.sharedMemorySize)
axis->axisBlock[1] = i;
23442 i = 2 * currentAxisBlock1;
23447 if ((
axis->specializationConstants.mergeSequencesR2C != 0) && (
axis->specializationConstants.fftDim *
axis->axisBlock[1] >= maxSequenceLengthSharedMemory)) {
23448 axis->specializationConstants.mergeSequencesR2C = 0;
23467 if (
axis->axisBlock[0] *
axis->axisBlock[1] > maxThreadNum) {
23468 for (uint64_t i = 1; i <=
axis->axisBlock[1]; i++) {
23469 if ((
axis->axisBlock[1] / i) *
axis->axisBlock[0] <= maxThreadNum)
23471 axis->axisBlock[1] /= i;
23472 i =
axis->axisBlock[1] + 1;
23477 while ((
axis->axisBlock[1] * (
axis->specializationConstants.fftDim /
axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory)
axis->axisBlock[1] /= 2;
23479#if (VKFFT_BACKEND==0)
23480 if (((
axis->specializationConstants.fftDim & (
axis->specializationConstants.fftDim - 1)) != 0)) {
23481 uint64_t temp =
axis->axisBlock[1];
23482 axis->axisBlock[1] =
axis->axisBlock[0];
23483 axis->axisBlock[0] = temp;
23484 axis->specializationConstants.axisSwapped = 1;
23487 uint64_t temp =
axis->axisBlock[1];
23488 axis->axisBlock[1] =
axis->axisBlock[0];
23489 axis->axisBlock[0] = temp;
23490 axis->specializationConstants.axisSwapped = 1;
23493 axis->axisBlock[2] = 1;
23494 axis->axisBlock[3] =
axis->specializationConstants.fftDim;
23497 axis->axisBlock[1] = (
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost > 1) ?
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost : 1;
23500 axis->axisBlock[0] = (
axis->specializationConstants.stageStartSize >
axis->groupedBatch) ?
axis->groupedBatch :
axis->specializationConstants.stageStartSize;
23502 if (
axis->axisBlock[0] *
axis->axisBlock[1] > maxThreadNum) {
23503 for (uint64_t i = 1; i <=
axis->axisBlock[0]; i++) {
23504 if ((
axis->axisBlock[0] / i) *
axis->axisBlock[1] <= maxThreadNum)
23506 axis->axisBlock[0] /= i;
23507 i =
axis->axisBlock[0] + 1;
23512 axis->axisBlock[2] = 1;
23513 axis->axisBlock[3] =
axis->specializationConstants.fftDim;
23517 if (axis_id == 1) {
23519 axis->axisBlock[1] = (
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost > 1) ?
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost : 1;
23523 if (
axis->axisBlock[0] *
axis->axisBlock[1] > maxThreadNum) {
23524 for (uint64_t i = 1; i <=
axis->axisBlock[0]; i++) {
23525 if ((
axis->axisBlock[0] / i) *
axis->axisBlock[1] <= maxThreadNum)
23527 axis->axisBlock[0] /= i;
23528 i =
axis->axisBlock[0] + 1;
23533 axis->axisBlock[2] = 1;
23534 axis->axisBlock[3] =
axis->specializationConstants.fftDim;
23537 if (axis_id == 2) {
23538 axis->axisBlock[1] = (
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost > 1) ?
axis->specializationConstants.fftDim /
axis->specializationConstants.min_registers_per_thread /
axis->specializationConstants.registerBoost : 1;
23543 if (
axis->axisBlock[0] *
axis->axisBlock[1] > maxThreadNum) {
23544 for (uint64_t i = 1; i <=
axis->axisBlock[0]; i++) {
23545 if ((
axis->axisBlock[0] / i) *
axis->axisBlock[1] <= maxThreadNum)
23547 axis->axisBlock[0] /= i;
23548 i =
axis->axisBlock[0] + 1;
23553 axis->axisBlock[2] = 1;
23554 axis->axisBlock[3] =
axis->specializationConstants.fftDim;
23569 axis->specializationConstants.localSize[0] =
axis->axisBlock[0];
23570 axis->specializationConstants.localSize[1] =
axis->axisBlock[1];
23571 axis->specializationConstants.localSize[2] =
axis->axisBlock[2];
23584 axis->specializationConstants.axis_id = axis_id;
23585 axis->specializationConstants.axis_upload_id = axis_upload_id;
23587 for (uint64_t i = 0; i < 3; i++) {
23593 if (
axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 0) || (FFTPlan->
numAxisUploads[axis_id] == 1))) {
23594 axis->specializationConstants.zeropadBluestein[0] = 1;
23595 axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = app->
configuration.
size[axis_id];
23596 if (FFTPlan->
multiUploadR2C)
axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2;
23597 if (app->
configuration.
performDCT == 1)
axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = 2 *
axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id]-2;
23599 axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id];
23601 if (
axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->
numAxisUploads[axis_id] == 1))) {
23602 axis->specializationConstants.zeropadBluestein[1] = 1;
23603 axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = app->
configuration.
size[axis_id];
23604 if (FFTPlan->
multiUploadR2C)
axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2;
23605 if (app->
configuration.
performDCT == 1)
axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = 2 *
axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] - 2;
23607 axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id] = FFTPlan->
actualFFTSizePerAxis[axis_id][axis_id];
23616 axis->specializationConstants.zeropad[0] = 0;
23623 axis->specializationConstants.zeropad[1] = 0;
23632 axis->specializationConstants.zeropad[0] = 0;
23639 axis->specializationConstants.zeropad[1] = 0;
23642 axis->specializationConstants.convolutionStep = 1;
23645 axis->specializationConstants.convolutionStep = 0;
23647 axis->specializationConstants.BluesteinConvolutionStep = 1;
23649 axis->specializationConstants.BluesteinConvolutionStep = 0;
23652 axis->specializationConstants.BluesteinPreMultiplication = 1;
23654 axis->specializationConstants.BluesteinPreMultiplication = 0;
23656 axis->specializationConstants.BluesteinPostMultiplication = 1;
23658 axis->specializationConstants.BluesteinPostMultiplication = 0;
23664 if (axis_id == 0) {
23665 if (axis_upload_id == 0)
23669 if ((FFTPlan->
actualPerformR2CPerAxis[axis_id] == 1) && (
axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0);
23675 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23677 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23679 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23681 if (axis_id == 1) {
23691 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23693 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23695 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23698 if (axis_id == 2) {
23707 else axis->specializationConstants.performWorkGroupShift[0] = 0;
23709 else axis->specializationConstants.performWorkGroupShift[1] = 0;
23711 else axis->specializationConstants.performWorkGroupShift[2] = 0;
23715 char floatTypeInputMemory[10];
23716 char floatTypeOutputMemory[10];
23717 char floatTypeKernelMemory[10];
23718 char floatType[10];
23719 axis->specializationConstants.unroll = 1;
23722 sprintf(floatType,
"double");
23723 sprintf(floatTypeInputMemory,
"double");
23724 sprintf(floatTypeOutputMemory,
"double");
23725 sprintf(floatTypeKernelMemory,
"double");
23731 sprintf(floatType,
"float");
23734 sprintf(floatTypeKernelMemory,
"float");
23735 if ((axis_id == app->
firstAxis) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1) && (!
axis->specializationConstants.actualInverse))
23736 sprintf(floatTypeInputMemory,
"half");
23738 sprintf(floatTypeInputMemory,
"float");
23739 if ((axis_id == app->
firstAxis) && (((!
axis->specializationConstants.reorderFourStep) && (axis_upload_id == FFTPlan->
numAxisUploads[axis_id] - 1)) || ((
axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) && (
axis->specializationConstants.actualInverse))
23740 sprintf(floatTypeOutputMemory,
"half");
23742 sprintf(floatTypeOutputMemory,
"float");
23745 sprintf(floatTypeInputMemory,
"half");
23746 sprintf(floatTypeOutputMemory,
"half");
23747 sprintf(floatTypeKernelMemory,
"half");
23753 sprintf(floatType,
"double");
23754 sprintf(floatTypeInputMemory,
"float");
23755 sprintf(floatTypeOutputMemory,
"float");
23756 sprintf(floatTypeKernelMemory,
"float");
23759 sprintf(floatType,
"float");
23760 sprintf(floatTypeInputMemory,
"float");
23761 sprintf(floatTypeOutputMemory,
"float");
23762 sprintf(floatTypeKernelMemory,
"float");
23766 char uintType[20] =
"";
23768#if(VKFFT_BACKEND==0)
23769 sprintf(uintType,
"uint");
23770#elif(VKFFT_BACKEND==1)
23771 sprintf(uintType,
"unsigned int");
23772#elif(VKFFT_BACKEND==2)
23773 sprintf(uintType,
"unsigned int");
23774#elif(VKFFT_BACKEND==3)
23775 sprintf(uintType,
"unsigned int");
23779#if(VKFFT_BACKEND==0)
23780 sprintf(uintType,
"uint64_t");
23781#elif(VKFFT_BACKEND==1)
23782 sprintf(uintType,
"unsigned long long");
23783#elif(VKFFT_BACKEND==2)
23784 sprintf(uintType,
"unsigned long long");
23785#elif(VKFFT_BACKEND==3)
23786 sprintf(uintType,
"unsigned long");
23792 if ((axis_id == 0) && (axis_upload_id == 0)) type = 0;
23793 if (axis_id != 0) type = 1;
23794 if ((axis_id == 0) && (axis_upload_id > 0)) type = 2;
23808#if(VKFFT_BACKEND==0)
23810#elif(VKFFT_BACKEND==1)
23811 axis->specializationConstants.cacheShuffle = 0;
23812#elif(VKFFT_BACKEND==2)
23813 axis->specializationConstants.cacheShuffle = 0;
23814#elif(VKFFT_BACKEND==3)
23815 axis->specializationConstants.cacheShuffle = 0;
23821 char* code0 =
axis->specializationConstants.code0;
23826 resFFT =
shaderGenVkFFT(code0, &
axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type);
23832#if(VKFFT_BACKEND==0)
23833 const glslang_resource_t default_resource = {
23939 glslang_target_client_version_t client_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0;
23940 glslang_target_language_version_t target_language_version = (app->
configuration.
halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0;
23941 const glslang_input_t input =
23943 GLSLANG_SOURCE_GLSL,
23944 GLSLANG_STAGE_COMPUTE,
23945 GLSLANG_CLIENT_VULKAN,
23947 GLSLANG_TARGET_SPV,
23948 target_language_version,
23951 GLSLANG_NO_PROFILE,
23954 GLSLANG_MSG_DEFAULT_BIT,
23958 glslang_shader_t* shader = glslang_shader_create(&input);
23960 if (!glslang_shader_preprocess(shader, &input))
23962 err = glslang_shader_get_info_log(shader);
23963 printf(
"%s\n", code0);
23964 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
23965 glslang_shader_delete(shader);
23973 if (!glslang_shader_parse(shader, &input))
23975 err = glslang_shader_get_info_log(shader);
23976 printf(
"%s\n", code0);
23977 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
23978 glslang_shader_delete(shader);
23985 glslang_program_t* program = glslang_program_create();
23986 glslang_program_add_shader(program, shader);
23987 if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT))
23989 err = glslang_program_get_info_log(program);
23990 printf(
"%s\n", code0);
23991 printf(
"%s\nVkFFT shader type: %" PRIu64
"\n", err, type);
23992 glslang_shader_delete(shader);
23993 glslang_program_delete(program);
24016 glslang_shader_delete(shader);
24017 VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO };
24019 pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
24020 VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO };
24024 res = vkCreateShaderModule(app->
configuration.
device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module);
24025 if (res != VK_SUCCESS) {
24026 glslang_program_delete(program);
24032 pipelineShaderStageCreateInfo.pName =
"main";
24033 pipelineShaderStageCreateInfo.pSpecializationInfo = 0;
24037 if (res != VK_SUCCESS) {
24041 vkDestroyShaderModule(app->
configuration.
device[0], pipelineShaderStageCreateInfo.module, 0);
24042 glslang_program_delete(program);
24043#elif(VKFFT_BACKEND==1)
24045 nvrtcResult result = nvrtcCreateProgram(&prog,
24053 if (result != NVRTC_SUCCESS) {
24054 printf(
"nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result));
24063 result = nvrtcCompileProgram(prog,
24066 if (result != NVRTC_SUCCESS) {
24067 printf(
"nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result));
24068 char*
log = (
char*)malloc(
sizeof(
char) * 1000000);
24076 nvrtcGetProgramLog(prog,
log);
24077 printf(
"%s\n",
log);
24080 printf(
"%s\n", code0);
24088 result = nvrtcGetPTXSize(prog, &ptxSize);
24089 if (result != NVRTC_SUCCESS) {
24090 printf(
"nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result));
24096 char* ptx = (
char*)malloc(ptxSize);
24103 result = nvrtcGetPTX(prog, ptx);
24104 if (result != NVRTC_SUCCESS) {
24105 printf(
"nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result));
24113 result = nvrtcDestroyProgram(&prog);
24114 if (result != NVRTC_SUCCESS) {
24115 printf(
"nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result));
24124 CUresult result2 = cuModuleLoadDataEx(&
axis->VkFFTModule, ptx, 0, 0, 0);
24126 if (result2 != CUDA_SUCCESS) {
24127 printf(
"cuModuleLoadDataEx error: %d\n", result2);
24135 result2 = cuModuleGetFunction(&
axis->VkFFTKernel,
axis->VkFFTModule,
"VkFFT_main");
24136 if (result2 != CUDA_SUCCESS) {
24137 printf(
"cuModuleGetFunction error: %d\n", result2);
24146 result2 = cuFuncSetAttribute(
axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (
int)
axis->specializationConstants.usedSharedMemory);
24147 if (result2 != CUDA_SUCCESS) {
24148 printf(
"cuFuncSetAttribute error: %d\n", result2);
24158 result2 = cuModuleGetGlobal(&
axis->consts_addr, &size,
axis->VkFFTModule,
"consts");
24159 if (result2 != CUDA_SUCCESS) {
24160 printf(
"cuModuleGetGlobal error: %d\n", result2);
24170#elif(VKFFT_BACKEND==2)
24171 hiprtcProgram prog;
24176 enum hiprtcResult result = hiprtcCreateProgram(&prog,
24182 if (result != HIPRTC_SUCCESS) {
24183 printf(
"hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result));
24190 result = hiprtcAddNameExpression(prog,
"&consts");
24191 if (result != HIPRTC_SUCCESS) {
24192 printf(
"hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result));
24199 result = hiprtcCompileProgram(prog,
24202 if (result != HIPRTC_SUCCESS) {
24203 printf(
"hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result));
24204 char*
log = (
char*)malloc(
sizeof(
char) * 100000);
24212 hiprtcGetProgramLog(prog,
log);
24213 printf(
"%s\n",
log);
24216 printf(
"%s\n", code0);
24224 result = hiprtcGetCodeSize(prog, &codeSize);
24225 if (result != HIPRTC_SUCCESS) {
24226 printf(
"hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result));
24232 char* code = (
char*)malloc(codeSize);
24239 result = hiprtcGetCode(prog, code);
24240 if (result != HIPRTC_SUCCESS) {
24241 printf(
"hiprtcGetCode error: %s\n", hiprtcGetErrorString(result));
24251 result = hiprtcDestroyProgram(&prog);
24252 if (result != HIPRTC_SUCCESS) {
24253 printf(
"hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result));
24261 hipError_t result2 = hipModuleLoadDataEx(&
axis->VkFFTModule, code, 0, 0, 0);
24263 if (result2 != hipSuccess) {
24264 printf(
"hipModuleLoadDataEx error: %d\n", result2);
24272 result2 = hipModuleGetFunction(&
axis->VkFFTKernel,
axis->VkFFTModule,
"VkFFT_main");
24273 if (result2 != hipSuccess) {
24274 printf(
"hipModuleGetFunction error: %d\n", result2);
24283 result2 = hipFuncSetAttribute(
axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (
int)
axis->specializationConstants.usedSharedMemory);
24285 if (result2 != hipSuccess) {
24286 printf(
"hipFuncSetAttribute error: %d\n", result2);
24296 result2 = hipModuleGetGlobal(&
axis->consts_addr, &size,
axis->VkFFTModule,
"consts");
24297 if (result2 != hipSuccess) {
24298 printf(
"hipModuleGetGlobal error: %d\n", result2);
24309#elif(VKFFT_BACKEND==3)
24310 size_t codelen = strlen(code0);
24311 axis->program = clCreateProgramWithSource(app->
configuration.context[0], 1, (
const char**)&code0, &codelen, &res);
24312 if (res != CL_SUCCESS) {
24319 if (res != CL_SUCCESS) {
24322 char*
log = (
char*)malloc(log_size);
24331 printf(
"%s\n",
log);
24334 printf(
"%s\n", code0);
24341 axis->kernel = clCreateKernel(
axis->program,
"VkFFT_main", &res);
24342 if (res != CL_SUCCESS) {
24352 axis->specializationConstants.code0 = 0;
24355 if (
axis->specializationConstants.axisSwapped) {
24356 uint64_t temp =
axis->axisBlock[1];
24357 axis->axisBlock[1] =
axis->axisBlock[0];
24358 axis->axisBlock[0] = temp;
24359 axis->specializationConstants.axisSwapped = 0;
24370#if(VKFFT_BACKEND==0)
24373 int resGlslangInitialize = glslang_initialize_process();
24383 if (inputLaunchConfiguration.
device == 0) {
24388 if (inputLaunchConfiguration.
queue == 0) {
24398 if (inputLaunchConfiguration.
fence == 0) {
24404 VkPhysicalDeviceProperties physicalDeviceProperties = { 0 };
24417 switch (physicalDeviceProperties.vendorID) {
24457#elif(VKFFT_BACKEND==1)
24458 CUresult res = CUDA_SUCCESS;
24459 cudaError_t res_t = cudaSuccess;
24460 if (inputLaunchConfiguration.
device == 0) {
24465 if (inputLaunchConfiguration.num_streams != 0) app->
configuration.num_streams = inputLaunchConfiguration.num_streams;
24466 if (inputLaunchConfiguration.stream != 0) app->
configuration.stream = inputLaunchConfiguration.stream;
24469 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, app->
configuration.
device[0]);
24470 if (res != CUDA_SUCCESS) {
24475 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, app->
configuration.
device[0]);
24476 if (res != CUDA_SUCCESS) {
24481 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, app->
configuration.
device[0]);
24482 if (res != CUDA_SUCCESS) {
24487 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, app->
configuration.
device[0]);
24488 if (res != CUDA_SUCCESS) {
24493 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, app->
configuration.
device[0]);
24494 if (res != CUDA_SUCCESS) {
24499 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, app->
configuration.
device[0]);
24500 if (res != CUDA_SUCCESS) {
24505 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, app->
configuration.
device[0]);
24506 if (res != CUDA_SUCCESS) {
24511 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, app->
configuration.
device[0]);
24512 if (res != CUDA_SUCCESS) {
24517 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, app->
configuration.
device[0]);
24518 if (res != CUDA_SUCCESS) {
24523 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, app->
configuration.
device[0]);
24524 if (res != CUDA_SUCCESS) {
// Create one event per configured stream (stream_event[i] for each i below
// num_streams).
24536 for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
// cudaEventCreate is a runtime-API call and reports through res_t (cudaError_t).
// The check must test res_t against cudaSuccess. Testing the driver-API
// variable res would only re-examine an earlier call's result and would never
// catch an event-creation failure.
24537 res_t = cudaEventCreate(&app->
configuration.stream_event[i]);
24538 if (res_t != cudaSuccess) {
24551#elif(VKFFT_BACKEND==2)
24552 hipError_t res = hipSuccess;
24553 if (inputLaunchConfiguration.
device == 0) {
24558 if (inputLaunchConfiguration.num_streams != 0) app->
configuration.num_streams = inputLaunchConfiguration.num_streams;
24559 if (inputLaunchConfiguration.stream != 0) app->
configuration.stream = inputLaunchConfiguration.stream;
24562 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, app->
configuration.
device[0]);
24563 if (res != hipSuccess) {
24568 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, app->
configuration.
device[0]);
24569 if (res != hipSuccess) {
24574 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, app->
configuration.
device[0]);
24575 if (res != hipSuccess) {
24580 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, app->
configuration.
device[0]);
24581 if (res != hipSuccess) {
24586 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimX, app->
configuration.
device[0]);
24587 if (res != hipSuccess) {
24592 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimY, app->
configuration.
device[0]);
24593 if (res != hipSuccess) {
24598 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimZ, app->
configuration.
device[0]);
24599 if (res != hipSuccess) {
24604 res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, app->
configuration.
device[0]);
24605 if (res != hipSuccess) {
24612 res = hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, app->
configuration.
device[0]);
24613 if (res != hipSuccess) {
24625 for (uint64_t i = 0; i < app->
configuration.num_streams; i++) {
24627 if (res != hipSuccess) {
24639#elif(VKFFT_BACKEND==3)
24641 if (inputLaunchConfiguration.
device == 0) {
24646 if (inputLaunchConfiguration.context == 0) {
24650 app->
configuration.context = inputLaunchConfiguration.context;
24651 if (inputLaunchConfiguration.platform == 0) {
24655 app->
configuration.platform = inputLaunchConfiguration.platform;
24657 size_t value_int64;
24658 cl_uint value_cl_uint;
24659 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_VENDOR_ID,
sizeof(cl_int), &vendorID, 0);
24664 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(
size_t), &value_int64, 0);
24670 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
sizeof(cl_uint), &value_cl_uint, 0);
24675 size_t* dims = (
size_t*)malloc(
sizeof(
size_t) * value_cl_uint);
24677 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(
size_t) * value_cl_uint, dims, 0);
24696 cl_ulong sharedMemorySize;
24697 res = clGetDeviceInfo(app->
configuration.
device[0], CL_DEVICE_LOCAL_MEM_SIZE,
sizeof(cl_ulong), &sharedMemorySize, 0);
24704 switch (vendorID) {
24746 if (inputLaunchConfiguration.
FFTdim == 0) {
24751 if (inputLaunchConfiguration.
size[0] == 0) {
24784 for (uint64_t i = 1; i < 3; i++) {
24785 if (inputLaunchConfiguration.
size[i] == 0)
24812#if(VKFFT_BACKEND==0)
24813 if (inputLaunchConfiguration.
bufferSize == 0) {
24834#if(VKFFT_BACKEND==0)
24865#if(VKFFT_BACKEND==0)
24892#if(VKFFT_BACKEND==0)
24918#if(VKFFT_BACKEND==0)
24919 if (inputLaunchConfiguration.
kernelSize == 0) {
24943 uint64_t checkBufferSizeFor64BitAddressing = 0;
24953 checkBufferSizeFor64BitAddressing = 0;
24960 checkBufferSizeFor64BitAddressing = 0;
24967 checkBufferSizeFor64BitAddressing = 0;
24985 if (inputLaunchConfiguration.
performR2C != 0) {
24988 if (inputLaunchConfiguration.
performDCT != 0) {
25102#if(VKFFT_BACKEND==0)
25215#if(VKFFT_BACKEND==0)
25217 glslang_finalize_process();
// For each of the three dimensions, compute how many chunks of at most
// maxBlockSize[d] are needed to cover dispatchBlock[d]:
// blockNumber[d] = ceil(dispatchBlock[d] / maxBlockSize[d]).
25226 uint64_t blockNumber[3] = { (uint64_t)ceil(dispatchBlock[0] / (
double)maxBlockSize[0]),(uint64_t)ceil(dispatchBlock[1] / (
double)maxBlockSize[1]),(uint64_t)ceil(dispatchBlock[2] / (
double)maxBlockSize[2]) };
// Use at least one chunk per dimension, even when dispatchBlock[d] is 0.
25227 if (blockNumber[0] == 0) blockNumber[0] = 1;
25228 if (blockNumber[1] == 0) blockNumber[1] = 1;
25229 if (blockNumber[2] == 0) blockNumber[2] = 1;
// When several chunks are needed but they do not tile dispatchBlock[d] exactly,
// use a chunk size i that divides dispatchBlock[d] evenly, so every chunk has
// the same size. The loop that supplies i is not visible in this excerpt.
25230 if ((blockNumber[0] > 1) && (blockNumber[0] * maxBlockSize[0] != dispatchBlock[0])) {
25232 if (dispatchBlock[0] % i == 0) {
25233 maxBlockSize[0] = i;
25234 blockNumber[0] = dispatchBlock[0] / i;
25239 if ((blockNumber[1] > 1) && (blockNumber[1] * maxBlockSize[1] != dispatchBlock[1])) {
25241 if (dispatchBlock[1] % i == 0) {
25242 maxBlockSize[1] = i;
25243 blockNumber[1] = dispatchBlock[1] / i;
25248 if ((blockNumber[2] > 1) && (blockNumber[2] * maxBlockSize[2] != dispatchBlock[2])) {
25250 if (dispatchBlock[2] % i == 0) {
25251 maxBlockSize[2] = i;
25252 blockNumber[2] = dispatchBlock[2] / i;
// A dimension covered by a single chunk uses exactly dispatchBlock[d] as its
// chunk size.
25259 for (uint64_t i = 0; i < 3; i++)
25260 if (blockNumber[i] == 1) maxBlockSize[i] = dispatchBlock[i];
25261 for (uint64_t i = 0; i < blockNumber[0]; i++) {
25262 for (uint64_t j = 0; j < blockNumber[1]; j++) {
25263 for (uint64_t k = 0; k < blockNumber[2]; k++) {
25264 if (
axis->pushConstants.workGroupShift[0] != i * maxBlockSize[0]) {
25265 axis->pushConstants.workGroupShift[0] = i * maxBlockSize[0];
25266 axis->updatePushConstants = 1;
25268 if (
axis->pushConstants.workGroupShift[1] != j * maxBlockSize[1]) {
25269 axis->pushConstants.workGroupShift[1] = j * maxBlockSize[1];
25270 axis->updatePushConstants = 1;
25272 if (
axis->pushConstants.workGroupShift[2] != k * maxBlockSize[2]) {
25273 axis->pushConstants.workGroupShift[2] = k * maxBlockSize[2];
25274 axis->updatePushConstants = 1;
25276#if(VKFFT_BACKEND==0)
25282 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)
axis->pushConstants.workGroupShift[0];
25283 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)
axis->pushConstants.workGroupShift[1];
25284 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)
axis->pushConstants.workGroupShift[2];
25285 vkCmdPushConstants(app->
configuration.
commandBuffer[0],
axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)sizePushConsts, &
axis->pushConstantsUint32);
25287 vkCmdDispatch(app->
configuration.
commandBuffer[0], (uint32_t)maxBlockSize[0], (uint32_t)maxBlockSize[1], (uint32_t)maxBlockSize[2]);
25288#elif(VKFFT_BACKEND==1)
25290 CUresult result = CUDA_SUCCESS;
25291 args[0] =
axis->inputBuffer;
25292 args[1] =
axis->outputBuffer;
25293 uint64_t args_id = 2;
25294 if (
axis->specializationConstants.convolutionStep) {
25298 if (
axis->specializationConstants.LUT) {
25299 args[args_id] = &
axis->bufferLUT;
25302 if (
axis->specializationConstants.useBluesteinFFT &&
axis->specializationConstants.BluesteinConvolutionStep) {
25303 if (
axis->specializationConstants.inverseBluestein)
25309 if (
axis->specializationConstants.useBluesteinFFT && (
axis->specializationConstants.BluesteinPreMultiplication ||
axis->specializationConstants.BluesteinPostMultiplication)) {
25314 if (
axis->updatePushConstants) {
25315 axis->updatePushConstants = 0;
25318 result = cuMemcpyHtoD(
axis->consts_addr, &
axis->pushConstants, sizePushConsts);
25321 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)
axis->pushConstants.workGroupShift[0];
25322 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)
axis->pushConstants.workGroupShift[1];
25323 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)
axis->pushConstants.workGroupShift[2];
25324 result = cuMemcpyHtoD(
axis->consts_addr, &
axis->pushConstantsUint32, sizePushConsts);
25326 if (result != CUDA_SUCCESS) {
25327 printf(
"cuMemcpyHtoD error: %d\n", result);
25332 result = cuLaunchKernel(
axis->VkFFTKernel,
25333 (
unsigned int)maxBlockSize[0], (
unsigned int)maxBlockSize[1], (
unsigned int)maxBlockSize[2],
25334 (
unsigned int)
axis->specializationConstants.localSize[0], (
unsigned int)
axis->specializationConstants.localSize[1], (
unsigned int)
axis->specializationConstants.localSize[2],
25339 result = cuLaunchKernel(
axis->VkFFTKernel,
25340 (
unsigned int)maxBlockSize[0], (
unsigned int)maxBlockSize[1], (
unsigned int)maxBlockSize[2],
25341 (
unsigned int)
axis->specializationConstants.localSize[0], (
unsigned int)
axis->specializationConstants.localSize[1], (
unsigned int)
axis->specializationConstants.localSize[2],
25342 (
unsigned int)
axis->specializationConstants.usedSharedMemory, 0,
25345 if (result != CUDA_SUCCESS) {
25346 printf(
"cuLaunchKernel error: %d, %" PRIu64
" %" PRIu64
" %" PRIu64
" - %" PRIu64
" %" PRIu64
" %" PRIu64
"\n", result, maxBlockSize[0], maxBlockSize[1], maxBlockSize[2],
axis->specializationConstants.localSize[0],
axis->specializationConstants.localSize[1],
axis->specializationConstants.localSize[2]);
25357#elif(VKFFT_BACKEND==2)
25358 hipError_t result = hipSuccess;
25360 args[0] =
axis->inputBuffer;
25361 args[1] =
axis->outputBuffer;
25362 uint64_t args_id = 2;
25363 if (
axis->specializationConstants.convolutionStep) {
25367 if (
axis->specializationConstants.LUT) {
25368 args[args_id] = &
axis->bufferLUT;
25371 if (
axis->specializationConstants.useBluesteinFFT &&
axis->specializationConstants.BluesteinConvolutionStep) {
25372 if (
axis->specializationConstants.inverseBluestein)
25378 if (
axis->specializationConstants.useBluesteinFFT && (
axis->specializationConstants.BluesteinPreMultiplication ||
axis->specializationConstants.BluesteinPostMultiplication)) {
25383 if (
axis->updatePushConstants) {
25384 axis->updatePushConstants = 0;
25387 result = hipMemcpyHtoD(
axis->consts_addr, &
axis->pushConstants, sizePushConsts);
25390 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)
axis->pushConstants.workGroupShift[0];
25391 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)
axis->pushConstants.workGroupShift[1];
25392 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)
axis->pushConstants.workGroupShift[2];
25393 result = hipMemcpyHtoD(
axis->consts_addr, &
axis->pushConstantsUint32, sizePushConsts);
25395 if (result != hipSuccess) {
25396 printf(
"hipMemcpyHtoD error: %d\n", result);
25402 result = hipModuleLaunchKernel(
axis->VkFFTKernel,
25403 (
unsigned int)maxBlockSize[0], (
unsigned int)maxBlockSize[1], (
unsigned int)maxBlockSize[2],
25404 (
unsigned int)
axis->specializationConstants.localSize[0], (
unsigned int)
axis->specializationConstants.localSize[1], (
unsigned int)
axis->specializationConstants.localSize[2],
25409 result = hipModuleLaunchKernel(
axis->VkFFTKernel,
25410 (
unsigned int)maxBlockSize[0], (
unsigned int)maxBlockSize[1], (
unsigned int)maxBlockSize[2],
25411 (
unsigned int)
axis->specializationConstants.localSize[0], (
unsigned int)
axis->specializationConstants.localSize[1], (
unsigned int)
axis->specializationConstants.localSize[2],
25412 (
unsigned int)
axis->specializationConstants.usedSharedMemory, 0,
25415 if (result != hipSuccess) {
25416 printf(
"hipModuleLaunchKernel error: %d, %" PRIu64
" %" PRIu64
" %" PRIu64
" - %" PRIu64
" %" PRIu64
" %" PRIu64
"\n", result, maxBlockSize[0], maxBlockSize[1], maxBlockSize[2],
axis->specializationConstants.localSize[0],
axis->specializationConstants.localSize[1],
axis->specializationConstants.localSize[2]);
25427#elif(VKFFT_BACKEND==3)
25428 cl_int result = CL_SUCCESS;
25430 args[0] =
axis->inputBuffer;
25431 result = clSetKernelArg(
axis->kernel, 0,
sizeof(cl_mem), args[0]);
25432 if (result != CL_SUCCESS) {
25435 args[1] =
axis->outputBuffer;
25436 result = clSetKernelArg(
axis->kernel, 1,
sizeof(cl_mem), args[1]);
25437 if (result != CL_SUCCESS) {
25440 uint64_t args_id = 2;
25441 if (
axis->specializationConstants.convolutionStep) {
25443 result = clSetKernelArg(
axis->kernel, (cl_uint)args_id,
sizeof(cl_mem), args[args_id]);
25444 if (result != CL_SUCCESS) {
25449 if (
axis->specializationConstants.LUT) {
25450 args[args_id] = &
axis->bufferLUT;
25451 result = clSetKernelArg(
axis->kernel, (cl_uint)args_id,
sizeof(cl_mem), args[args_id]);
25452 if (result != CL_SUCCESS) {
25457 if (
axis->specializationConstants.useBluesteinFFT &&
axis->specializationConstants.BluesteinConvolutionStep) {
25458 if (
axis->specializationConstants.inverseBluestein)
25462 result = clSetKernelArg(
axis->kernel, (cl_uint)args_id,
sizeof(cl_mem), args[args_id]);
25463 if (result != CL_SUCCESS) {
25468 if (
axis->specializationConstants.useBluesteinFFT && (
axis->specializationConstants.BluesteinPreMultiplication ||
axis->specializationConstants.BluesteinPostMultiplication)) {
25470 result = clSetKernelArg(
axis->kernel, (cl_uint)args_id,
sizeof(cl_mem), args[args_id]);
25471 if (result != CL_SUCCESS) {
25479 result = clSetKernelArg(
axis->kernel, (cl_uint)args_id, sizePushConsts, &
axis->pushConstants);
25482 axis->pushConstantsUint32.workGroupShift[0] = (uint32_t)
axis->pushConstants.workGroupShift[0];
25483 axis->pushConstantsUint32.workGroupShift[1] = (uint32_t)
axis->pushConstants.workGroupShift[1];
25484 axis->pushConstantsUint32.workGroupShift[2] = (uint32_t)
axis->pushConstants.workGroupShift[2];
25485 result = clSetKernelArg(
axis->kernel, (cl_uint)args_id, sizePushConsts, &
axis->pushConstantsUint32);
25487 if (result != CL_SUCCESS) {
25491 size_t local_work_size[3] = { (size_t)
axis->specializationConstants.localSize[0], (
size_t)
axis->specializationConstants.localSize[1],(size_t)
axis->specializationConstants.localSize[2] };
25492 size_t global_work_size[3] = { (size_t)maxBlockSize[0] * local_work_size[0] , (
size_t)maxBlockSize[1] * local_work_size[1] ,(size_t)maxBlockSize[2] * local_work_size[2] };
25493 result = clEnqueueNDRangeKernel(app->
configuration.commandQueue[0],
axis->kernel, 3, 0, global_work_size, local_work_size, 0, 0, 0);
25496 if (result != CL_SUCCESS) {
25506#if(VKFFT_BACKEND==0)
25508#elif(VKFFT_BACKEND==1)
25510 cudaError_t res = cudaSuccess;
25511 for (uint64_t s = 0; s < app->
configuration.num_streams; s++) {
25512 res = cudaEventSynchronize(app->
configuration.stream_event[s]);
25517#elif(VKFFT_BACKEND==2)
25519 hipError_t res = hipSuccess;
25520 for (uint64_t s = 0; s < app->
configuration.num_streams; s++) {
25521 res = hipEventSynchronize(app->
configuration.stream_event[s]);
25526#elif(VKFFT_BACKEND==3)
25534 printf(
"read: inputBuffer\n");
25536 printf(
"read: buffer\n");
25538 printf(
"read: tempBuffer\n");
25540 printf(
"read: outputBuffer\n");
25542 printf(
"write: inputBuffer\n");
25544 printf(
"write: buffer\n");
25546 printf(
"write: tempBuffer\n");
25548 printf(
"write: outputBuffer\n");
25553#if(VKFFT_BACKEND==0)
25555 VkMemoryBarrier memory_barrier = {
25556 VK_STRUCTURE_TYPE_MEMORY_BARRIER,
25558 VK_ACCESS_SHADER_WRITE_BIT,
25559 VK_ACCESS_SHADER_READ_BIT,
25562#elif(VKFFT_BACKEND==1)
25564#elif(VKFFT_BACKEND==2)
25566#elif(VKFFT_BACKEND==3)
25567 app->
configuration.commandQueue = launchParams->commandQueue;
25569 uint64_t localSize0[3];
25596#if(VKFFT_BACKEND==0)
25600 uint64_t dispatchBlock[3];
25622 if (
axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25637#if(VKFFT_BACKEND==0)
25641 uint64_t dispatchBlock[3];
25663 if (
axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25679#if(VKFFT_BACKEND==0)
25683 uint64_t dispatchBlock[3];
25686 dispatchBlock[1] = 1;
25712 uint64_t dispatchBlock[3];
25714 dispatchBlock[1] = 1;
25732#if(VKFFT_BACKEND==0)
25736 uint64_t dispatchBlock[3];
25739 dispatchBlock[1] = 1;
25755#if(VKFFT_BACKEND==0)
25759 uint64_t dispatchBlock[3];
25761 dispatchBlock[1] = 1;
25791 uint64_t dispatchBlock[3];
25793 dispatchBlock[1] = 1;
25809#if(VKFFT_BACKEND==0)
25813 uint64_t dispatchBlock[3];
25815 dispatchBlock[1] = 1;
25829#if(VKFFT_BACKEND==0)
25833 uint64_t dispatchBlock[3];
25835 dispatchBlock[1] = 1;
25862#if(VKFFT_BACKEND==0)
25866 uint64_t dispatchBlock[3];
25868 dispatchBlock[1] = 1;
25884#if(VKFFT_BACKEND==0)
25888 uint64_t dispatchBlock[3];
25890 dispatchBlock[1] = 1;
25910#if(VKFFT_BACKEND==0)
25914 uint64_t dispatchBlock[3];
25916 dispatchBlock[1] = 1;
25932#if(VKFFT_BACKEND==0)
25936 uint64_t dispatchBlock[3];
25958 if (
axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
25974#if(VKFFT_BACKEND==0)
25978 uint64_t dispatchBlock[3];
25980 dispatchBlock[1] = 1;
26004#if(VKFFT_BACKEND==0)
26008 uint64_t dispatchBlock[3];
26010 dispatchBlock[1] = 1;
26035#if(VKFFT_BACKEND==0)
26039 uint64_t dispatchBlock[3];
26041 dispatchBlock[1] = 1;
26063#if(VKFFT_BACKEND==0)
26067 uint64_t dispatchBlock[3];
26070 dispatchBlock[1] = 1;
26085#if(VKFFT_BACKEND==0)
26089 uint64_t dispatchBlock[3];
26111 if (
axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
26127#if(VKFFT_BACKEND==0)
26131 uint64_t dispatchBlock[3];
26153 if (
axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0);
static VkFFTResult VkMovReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in)
static VkFFTResult VkMulComplexNumber(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
static int VkFFTGetVersion()
static VkFFTResult VkMovComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in)
static VkFFTResult appendPreparationBatchedKernelConvolution(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t dataType)
static VkFFTResult shaderGenVkFFT_R2C_decomposition(char *output, VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeInputMemory, const char *floatTypeOutputMemory, const char *floatTypeKernelMemory, const char *uintType, uint64_t type)
static VkFFTResult transferDataFromCPU(VkFFTApplication *app, void *arr, VkBuffer *buffer, VkDeviceSize bufferSize)
static VkFFTResult appendReorder4StepRead(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t reorderType)
static VkFFTResult appendZeropadEndReadWriteStage(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext)
static VkFFTResult appendLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult VkSharedLoad(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *id)
static VkFFTResult appendVersion(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult VkFFTSync(VkFFTApplication *app)
static VkFFTResult appendKernelLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatTypeMemory)
static VkFFTResult VkSubReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
static VkFFTResult appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t shuffleType, uint64_t start)
static VkFFTResult appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t sharedType)
static VkFFTResult dispatchEnhanced(VkFFTApplication *app, VkFFTAxis *axis, uint64_t *dispatchBlock)
static VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication *app, VkFFTPlan *FFTPlan, VkFFTAxis *axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse)
static VkFFTResult appendExtensions(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeInputMemory, const char *floatTypeOutputMemory, const char *floatTypeKernelMemory)
static VkFFTResult VkFFTGetRegistersPerThread(uint64_t *loc_multipliers, uint64_t *registers_per_thread_per_radix, uint64_t *registers_per_thread, uint64_t *min_registers_per_thread, uint64_t *isGoodSequence)
static VkFFTResult VkDivComplexNumber(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
static VkFFTResult appendWriteDataVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t writeType)
static VkFFTResult appendLicense(VkFFTSpecializationConstantsLayout *sc)
static void freeShaderGenVkFFT(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult initializeVkFFT(VkFFTApplication *app, VkFFTConfiguration inputLaunchConfiguration)
static VkFFTResult appendKernelConvolution(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t dataType)
static VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType)
static VkFFTResult appendPushConstantsVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType)
static VkFFTResult VkSubComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
static VkFFTResult appendSetSMToZero(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t readType)
static VkFFTResult appendCoordinateRegisterStore(VkFFTSpecializationConstantsLayout *sc, uint64_t readType)
static VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication *app, VkFFTPlan *FFTPlan, VkFFTAxis *axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse)
static VkFFTResult appendSinCos20(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType)
static VkFFTResult VkSharedStore(VkFFTSpecializationConstantsLayout *sc, const char *id, const char *in)
static VkFFTResult appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix)
static void deleteAxis(VkFFTApplication *app, VkFFTAxis *axis)
static VkFFTResult VkFMAComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num, const char *in_2)
static VkFFTResult appendBluesteinLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatType)
static VkFFTResult appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatTypeMemory, uint64_t outputType)
static VkFFTResult VkShuffleComplexInv(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
static VkFFTResult setWriteFromRegisters(VkFFTSpecializationConstantsLayout *sc, uint64_t writeType)
static VkFFTResult VkAppendLineFromInput(VkFFTSpecializationConstantsLayout *sc, const char *in)
static VkFFTResult findMemoryType(VkFFTApplication *app, uint64_t memoryTypeBits, uint64_t memorySize, VkMemoryPropertyFlags properties, uint32_t *memoryTypeIndex)
static VkFFTResult VkMulComplexConj(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
static VkFFTResult appendInitialization(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t initType)
static VkFFTResult VkShuffleComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
static VkFFTResult appendZeropadStartReadWriteStage(VkFFTSpecializationConstantsLayout *sc, uint64_t readStage)
static VkFFTResult shaderGenVkFFT(char *output, VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeInputMemory, const char *floatTypeOutputMemory, const char *floatTypeKernelMemory, const char *uintType, uint64_t type)
static VkFFTResult VkFFTPlanAxis(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse, uint64_t reverseBluesteinMultiUpload)
static VkFFTResult VkFFTScheduler(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t axis_id, uint64_t supportAxis)
static VkFFTResult appendRegisterBoostShuffle(VkFFTSpecializationConstantsLayout *sc, const char *floatType, uint64_t stageSize, uint64_t stageRadixPrev, uint64_t stageRadix, double stageAngle)
static VkFFTResult VkFFTAppend(VkFFTApplication *app, int inverse, VkFFTLaunchParams *launchParams)
static VkFFTResult appendLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatType)
static VkFFTResult VkAddComplexInv(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
static VkFFTResult appendReorder4StepWrite(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t reorderType)
static void deleteVkFFT(VkFFTApplication *app)
static VkFFTResult appendReadDataVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t readType)
static VkFFTResult setReadToRegisters(VkFFTSpecializationConstantsLayout *sc, uint64_t readType)
static VkFFTResult appendZeropadEnd(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult appendBarrierVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t numTab)
static VkFFTResult appendPushConstant(VkFFTSpecializationConstantsLayout *sc, const char *type, const char *name)
static VkFFTResult appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout *sc, uint64_t id, const char *floatTypeMemory, uint64_t inputType)
static VkFFTResult VkMulComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2, const char *temp)
static VkFFTResult VkModReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
static VkFFTResult appendRadixStage(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t shuffleType)
static VkFFTResult appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext)
static VkFFTResult appendConversion(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeDifferent)
static VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t inverse)
static VkFFTResult VkDivReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num)
static VkFFTResult appendRadixShuffle(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext, uint64_t shuffleType)
static VkFFTResult appendConstant(VkFFTSpecializationConstantsLayout *sc, const char *type, const char *name, const char *defaultVal, const char *LFending)
static VkFFTResult appendZeropadStart(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult VkFMAReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num, const char *in_2)
static VkFFTResult appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t strideType, uint64_t pre_or_post_multiplication)
static VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *uintType, uint64_t outputType, const char *index_x, const char *index_y, const char *coordinate, const char *batchID)
static VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *uintType, uint64_t inputType, const char *index_x, const char *index_y, const char *coordinate, const char *batchID)
static VkFFTResult allocateFFTBuffer(VkFFTApplication *app, VkBuffer *buffer, VkDeviceMemory *deviceMemory, VkBufferUsageFlags usageFlags, VkMemoryPropertyFlags propertyFlags, VkDeviceSize size)
static VkFFTResult VkPermute(VkFFTSpecializationConstantsLayout *sc, const uint64_t *permute, const uint64_t num_elem, const uint64_t type, char **regIDs)
static VkFFTResult inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t radix, uint64_t stageSize, double stageAngle, char **regID)
static VkFFTResult VkAddComplex(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
static VkFFTResult appendCoordinateRegisterPull(VkFFTSpecializationConstantsLayout *sc, uint64_t readType)
static VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication *app, VkFFTAxis *axis, uint64_t planStage, VkFFTLaunchParams *launchParams)
static VkFFTResult VkAppendLine(VkFFTSpecializationConstantsLayout *sc)
static VkFFTResult appendBluesteinConvolution(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *floatTypeMemory, const char *uintType, uint64_t dataType)
static VkFFTResult VkAddReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
static VkFFTResult VkMulComplexNumberImag(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_num, const char *temp)
static VkFFTResult VkMulReal(VkFFTSpecializationConstantsLayout *sc, const char *out, const char *in_1, const char *in_2)
static void printDebugInformation(VkFFTApplication *app, VkFFTAxis *axis)
static VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication *app, VkFFTPlan *FFTPlan, uint64_t axis_id, uint64_t supportAxis)
static VkFFTResult appendRadixStageStrided(VkFFTSpecializationConstantsLayout *sc, const char *floatType, const char *uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix)
@ VKFFT_ERROR_EMPTY_inputBuffer
@ VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM
@ VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE
@ VKFFT_ERROR_FAILED_TO_GET_FUNCTION
@ VKFFT_ERROR_INVALID_FENCE
@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH
@ VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY
@ VKFFT_ERROR_INVALID_PLATFORM
@ VKFFT_ERROR_FAILED_TO_RESET_FENCES
@ VKFFT_ERROR_UNSUPPORTED_RADIX
@ VKFFT_ERROR_FAILED_TO_COPY
@ VKFFT_ERROR_FAILED_SHADER_PREPROCESS
@ VKFFT_ERROR_EMPTY_FFTdim
@ VKFFT_ERROR_EMPTY_kernel
@ VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER
@ VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG
@ VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE
@ VKFFT_ERROR_EMPTY_inputBufferSize
@ VKFFT_ERROR_EMPTY_tempBuffer
@ VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE
@ VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION
@ VKFFT_ERROR_EMPTY_outputBuffer
@ VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER
@ VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER
@ VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER
@ VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE
@ VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL
@ VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM
@ VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY
@ VKFFT_ERROR_FAILED_TO_GET_CODE
@ VKFFT_ERROR_PLAN_NOT_INITIALIZED
@ VKFFT_ERROR_INVALID_COMMAND_POOL
@ VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES
@ VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL
@ VKFFT_ERROR_EMPTY_bufferSize
@ VKFFT_ERROR_EMPTY_tempBufferSize
@ VKFFT_ERROR_FAILED_TO_CREATE_BUFFER
@ VKFFT_ERROR_INVALID_CONTEXT
@ VKFFT_ERROR_FAILED_TO_INITIALIZE
@ VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL
@ VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED
@ VKFFT_ERROR_EMPTY_buffer
@ VKFFT_ERROR_INVALID_DEVICE
@ VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS
@ VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE
@ VKFFT_ERROR_FAILED_TO_CREATE_EVENT
@ VKFFT_ERROR_FAILED_TO_SYNCHRONIZE
@ VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT
@ VKFFT_ERROR_UNSUPPORTED_FFT_OMIT
@ VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE
@ VKFFT_ERROR_FAILED_TO_LOAD_MODULE
@ VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT
@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT
@ VKFFT_ERROR_FAILED_TO_ALLOCATE
@ VKFFT_ERROR_EMPTY_kernelSize
@ VKFFT_ERROR_FAILED_TO_MAP_MEMORY
@ VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C
@ VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE
@ VKFFT_ERROR_FAILED_TO_FIND_MEMORY
@ VKFFT_ERROR_FAILED_SHADER_LINK
@ VKFFT_ERROR_INVALID_QUEUE
@ VKFFT_ERROR_MALLOC_FAILED
@ VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS
@ VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED
@ VKFFT_ERROR_FAILED_TO_EVENT_RECORD
@ VKFFT_ERROR_FAILED_SHADER_PARSE
@ VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM
@ VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY
@ VKFFT_ERROR_INVALID_PHYSICAL_DEVICE
@ VKFFT_ERROR_NULL_TEMP_PASSED
@ VKFFT_ERROR_EMPTY_outputBufferSize
for(uint32_t i=0;i< queueCount;i++)
if(err !=VK_SUCCESS)
Creates the platform specific surface abstraction of the native platform window used for presentation...
DYN_FUNC Complex< Real > log(const Complex< Real > &)
DYN_FUNC Complex< Real > sin(const Complex< Real > &)
DYN_FUNC Complex< Real > pow(const Complex< Real > &, const Real &)
DYN_FUNC Complex< Real > cos(const Complex< Real > &)
DYN_FUNC Complex< Real > sqrt(const Complex< Real > &)
TEMPLATE_TYPENAME_T VEC3_T axis(QUAT_T const &q)
TEMPLATE_TYPENAME_T MAT4_T scale(MAT4_T const &m, VEC3_T const &v)
TEMPLATE_TYPENAME_T T angle(QUAT_T const &q)
TEMPLATE_TYPENAME_T QUAT_T inverse(QUAT_T const &q)
VkComputePipelineCreateInfo computePipelineCreateInfo(VkPipelineLayout layout, VkPipelineCreateFlags flags=0)
VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo(const VkDescriptorSetLayoutBinding *pBindings, uint32_t bindingCount)
VkDescriptorPoolSize descriptorPoolSize(VkDescriptorType type, uint32_t descriptorCount)
VkDescriptorPoolCreateInfo descriptorPoolCreateInfo(uint32_t poolSizeCount, VkDescriptorPoolSize *pPoolSizes, uint32_t maxSets)
VkCommandBufferAllocateInfo commandBufferAllocateInfo(VkCommandPool commandPool, VkCommandBufferLevel level, uint32_t bufferCount)
VkPushConstantRange pushConstantRange(VkShaderStageFlags stageFlags, uint32_t size, uint32_t offset)
VkWriteDescriptorSet writeDescriptorSet(VkDescriptorSet dstSet, VkDescriptorType type, uint32_t binding, VkDescriptorBufferInfo *bufferInfo, uint32_t descriptorCount=1)
VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo(const VkDescriptorSetLayout *pSetLayouts, uint32_t setLayoutCount=1)
VkDescriptorSetAllocateInfo descriptorSetAllocateInfo(VkDescriptorPool descriptorPool, const VkDescriptorSetLayout *pSetLayouts, uint32_t descriptorSetCount)
VkMemoryAllocateInfo memoryAllocateInfo()
VkCommandBufferBeginInfo commandBufferBeginInfo()
VkBufferCreateInfo bufferCreateInfo()
VkSubmitInfo submitInfo()
VkFFTConfiguration configuration
VkBuffer bufferBluesteinFFT[3]
uint64_t useBluesteinFFT[3]
VkBuffer bufferBluestein[3]
VkBuffer bufferBluesteinIFFT[3]
uint64_t actualNumBatches
VkDeviceMemory bufferBluesteinIFFTDeviceMemory[3]
VkDeviceMemory bufferBluesteinFFTDeviceMemory[3]
uint64_t bufferBluesteinSize[3]
VkFFTPlan * localFFTPlan_inverse
VkDeviceMemory bufferBluesteinDeviceMemory[3]
VkDeviceMemory bufferLUTDeviceMemory
VkFFTSpecializationConstantsLayout specializationConstants
uint64_t inputBufferOffset
uint64_t printMemoryLayout
uint64_t outputBufferOffset
uint64_t disableMergeSequencesR2C
uint64_t registerBoost4Step
uint64_t sharedMemorySize
uint64_t makeForwardPlanOnly
uint64_t isCompilerInitialized
uint64_t fixMaxRadixBluestein
uint64_t coordinateFeatures
uint64_t isOutputFormatted
VkMemoryBarrier * memory_barrier
uint64_t maxComputeWorkGroupSize[3]
uint64_t doublePrecisionFloatMemory
uint64_t makeInversePlanOnly
uint64_t * inputBufferSize
VkCommandPool * commandPool
uint64_t halfPrecisionMemoryOnly
VkPhysicalDevice * physicalDevice
uint64_t swapTo3Stage4Step
uint64_t isInputFormatted
uint64_t matrixConvolution
uint64_t performConvolution
uint64_t frequencyZeroPadding
uint64_t registerBoostNonPow2
uint64_t tempBufferOffset
uint64_t considerAllAxesStrided
uint64_t omitDimension[3]
uint64_t sharedMemorySizePow2
uint64_t performBandwidthBoost
uint64_t fft_zeropad_left[3]
VkDeviceMemory tempBufferDeviceMemory
uint64_t * tempBufferSize
VkCommandBuffer * commandBuffer
uint64_t allocateTempBuffer
uint64_t performZeropadding[3]
uint64_t sharedMemorySizeStatic
uint64_t inverseReturnToInputBuffer
uint64_t disableReorderFourStep
uint64_t inputBufferStride[3]
uint64_t kernelConvolution
uint64_t crossPowerSpectrumNormalization
uint64_t outputBufferStride[3]
uint64_t * outputBufferSize
uint64_t conjugateConvolution
uint64_t maxComputeWorkGroupCount[3]
uint64_t fft_zeropad_right[3]
VkCommandBuffer * commandBuffer
uint64_t numAxisUploads[3]
VkFFTAxis R2Cdecomposition
VkFFTAxis inverseBluesteinAxes[3][4]
uint64_t actualFFTSizePerAxis[3][3]
uint64_t actualPerformR2CPerAxis[3]
uint64_t outputBufferBlockNum
uint64_t resolveBankConflictFirstStages
uint64_t BluesteinPostMultiplication
uint64_t performBandwidthBoost
uint64_t BluesteinConvolutionStep
char gl_WorkGroupSize_y[50]
uint64_t inverseBluestein
uint64_t fft_zeropad_right_write[3]
uint64_t sharedStrideReadWriteConflict
uint64_t fft_zeropad_Bluestein_left_write[3]
uint64_t frequencyZeropadding
char gl_WorkGroupID_y[50]
char gl_WorkGroupSize_z[50]
char blockInvocationID[50]
uint64_t performBufferSetUpdate
uint64_t sharedStrideBankConflictFirstStages
char stageInvocationID[50]
uint64_t kernelNumberByteSize
char gl_LocalInvocationID_y[50]
uint64_t performZeropaddingFull[3]
uint64_t mergeSequencesR2C
uint64_t fft_zeropad_right_read[3]
uint64_t registers_per_thread
uint64_t inputNumberByteSize
uint64_t writeFromRegisters
uint64_t crossPowerSpectrumNormalization
uint64_t inputBufferBlockSize
uint64_t firstStageStartSize
uint64_t inputBufferBlockNum
uint64_t fft_zeropad_left_read[3]
uint64_t BluesteinPreMultiplication
char gl_WorkGroupID_x[50]
uint64_t conjugateConvolution
char gl_GlobalInvocationID_z[200]
uint64_t matrixConvolution
uint64_t min_registers_per_thread
uint64_t outputNumberByteSize
uint64_t zeropadBluestein[2]
uint64_t outputBufferBlockSize
char gl_LocalInvocationID_z[50]
char gl_GlobalInvocationID_x[200]
uint64_t dispatchZactualFFTSize
char gl_WorkGroupSize_x[50]
char gl_WorkGroupID_z[50]
char * disableThreadsStart
uint64_t fft_zeropad_left_write[3]
char gl_LocalInvocationID_x[50]
uint64_t registers_per_thread_per_radix[14]
uint64_t usedSharedMemory
uint64_t fft_zeropad_Bluestein_left_read[3]
char gl_GlobalInvocationID_y[200]
uint64_t fft_zeropad_left_full[3]
uint64_t performWorkGroupShift[3]
uint64_t fft_zeropad_right_full[3]